From d4d23502bfc20172a34ce36b8129696e1b63bfa7 Mon Sep 17 00:00:00 2001 From: ShengYang1 Date: Sun, 29 Mar 2020 16:23:03 +0800 Subject: [PATCH 001/412] Fuse BN and Relu in mkl path --- tensorflow/core/graph/mkl_layout_pass.cc | 56 +++- tensorflow/core/graph/mkl_layout_pass_test.cc | 106 +++++++ .../grappler/optimizers/mkl_remapper_test.cc | 173 +++++++++++ .../core/grappler/optimizers/remapper.cc | 23 +- tensorflow/core/kernels/BUILD | 9 +- .../core/kernels/mkl_fused_batch_norm_op.cc | 272 +++++++++++++----- tensorflow/core/ops/mkl_nn_ops.cc | 42 +++ tensorflow/core/ops/nn_ops.cc | 4 + 8 files changed, 601 insertions(+), 84 deletions(-) diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc index c27c7aa911b..e5d0fbfbd09 100644 --- a/tensorflow/core/graph/mkl_layout_pass.cc +++ b/tensorflow/core/graph/mkl_layout_pass.cc @@ -268,6 +268,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { csinfo_.dequantize = "Dequantize"; csinfo_.fused_batch_norm = "FusedBatchNorm"; csinfo_.fused_batch_norm_grad = "FusedBatchNormGrad"; + csinfo_.fused_batch_norm_ex = "_FusedBatchNormEx"; csinfo_.fused_batch_norm_v2 = "FusedBatchNormV2"; csinfo_.fused_batch_norm_grad_v2 = "FusedBatchNormGradV2"; csinfo_.fused_batch_norm_v3 = "FusedBatchNormV3"; @@ -294,6 +295,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { "_MklDepthwiseConv2dNativeBackpropInput"; csinfo_.mkl_depthwise_conv2d_grad_filter = "_MklDepthwiseConv2dNativeBackpropFilter"; + csinfo_.mkl_fused_batch_norm_ex = "_MklFusedBatchNormEx"; csinfo_.mkl_fused_conv2d = "_MklFusedConv2D"; csinfo_.mkl_fused_matmul = "_MklFusedMatMul"; csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D"; @@ -476,6 +478,11 @@ class MklLayoutRewritePass : public GraphOptimizationPass { {csinfo_.fused_batch_norm_grad_v3, mkl_op_registry::GetMklOpName(csinfo_.fused_batch_norm_grad_v3), CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation}); +#ifdef ENABLE_MKLDNN_V1 + rinfo_.push_back({csinfo_.fused_batch_norm_ex, + csinfo_.mkl_fused_batch_norm_ex, CopyAttrsAll, + FusedBatchNormExRewrite, kRewriteForLayoutPropagation}); +#endif rinfo_.push_back({csinfo_.fused_conv2d, csinfo_.mkl_fused_conv2d, CopyAttrsFusedConv2D, FusedConv2DRewrite, kRewriteForLayoutPropagation}); @@ -920,6 +927,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string dequantize; string fused_batch_norm; string fused_batch_norm_grad; + string fused_batch_norm_ex; string fused_batch_norm_v2; string fused_batch_norm_grad_v2; string fused_batch_norm_v3; @@ -944,6 +952,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { string mkl_conv2d_with_bias; string mkl_depthwise_conv2d_grad_input; string mkl_depthwise_conv2d_grad_filter; + string mkl_fused_batch_norm_ex; string mkl_fused_conv2d; string mkl_fused_matmul; string mkl_pad_with_conv2d; @@ -1652,6 +1661,31 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return do_rewrite; } + static bool FusedBatchNormExRewrite(const Node* n) { + CHECK_NOTNULL(n); + + int num_side_inputs; + TF_CHECK_OK(GetNodeAttr(n->def(), "num_side_inputs", &num_side_inputs)); + string activation_mode; + TF_CHECK_OK(GetNodeAttr(n->def(), "activation_mode", &activation_mode)); + + // if the num_side_inputs is not 0, don't rewrite the node. + if (num_side_inputs != 0) { + VLOG(1) << "FusedBatchNormExRewrite: The model sets num_side_inputs" + << "larger than 0 is not optimized by Intel MKL."; + return false; + } + + // if the activation_mode is not 'Relu', don't rewrite the node. 
+ if (activation_mode != "Relu") { + VLOG(1) << "FusedBatchNormExRewrite: Only Relu activation mode is" + << "supported by Intel MKL."; + return false; + } + + return true; + } + static bool FusedConv2DRewrite(const Node* n) { // MKL DNN currently doesn't support all fusions that grappler fuses // together with Conv2D (ex. batchnorm). We rewrite _FusedConv2D only if @@ -2131,9 +2165,6 @@ int MklLayoutRewritePass::SetUpContiguousInputs( // Number of input slots to original op // Input slots are represented by .Input() calls in REGISTER_OP. int old_node_input_slots = old_node->op_def().input_arg_size(); - // Actual number of inputs can be greater than or equal to number - // of Input slots because inputs of type list could be unfolded. - CHECK_GE(old_node_inputs.size(), old_node_input_slots); int nn_slot_idx = 0; // slot index for inputs of new node // Let's copy all inputs (TF tensors) of original node to new node. @@ -2141,13 +2172,14 @@ int MklLayoutRewritePass::SetUpContiguousInputs( for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) { // An input slot could be a single tensor or a list. We need // to handle this case accordingly. - CHECK_LT(iidx, old_node_inputs.size()); const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx); if (ArgIsList(arg)) { std::vector new_node_inputs; - int N = GetTensorListLength(arg, old_node); - GetNodesProducingTFTensorList(old_node_inputs, &iidx, N, - &new_node_inputs); + int tensor_list_length = GetTensorListLength(arg, old_node); + if (tensor_list_length != 0) { + GetNodesProducingTFTensorList(old_node_inputs, &iidx, + tensor_list_length, &new_node_inputs); + } nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -2180,13 +2212,14 @@ int MklLayoutRewritePass::SetUpContiguousInputs( for (int on_slot_idx = 0; on_slot_idx < old_node_input_slots; on_slot_idx++) { // An input slot could be a single tensor or a list. We need // to handle this case accordingly. 
- CHECK_LT(iidx, old_node_inputs.size()); const OpDef::ArgDef& arg = old_node->op_def().input_arg(on_slot_idx); if (ArgIsList(arg)) { std::vector new_node_inputs; - int N = GetTensorListLength(arg, old_node); - GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, N, - &new_node_inputs); + int tensor_list_length = GetTensorListLength(arg, old_node); + if (tensor_list_length != 0) { + GetNodesProducingMklTensorList(g, old_node, old_node_inputs, &iidx, + tensor_list_length, &new_node_inputs); + } nb->Input(new_node_inputs); nn_slot_idx++; } else { @@ -3702,6 +3735,7 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { n->type_string() != csinfo_.pad_with_conv2d && n->type_string() != csinfo_.pad_with_fused_conv2d && n->type_string() != csinfo_.conv2d_grad_filter_with_bias && + n->type_string() != csinfo_.fused_batch_norm_ex && n->type_string() != csinfo_.fused_conv2d && n->type_string() != csinfo_.fused_matmul && !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()), diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc index 02979d3ac2d..e5a50c27f0b 100644 --- a/tensorflow/core/graph/mkl_layout_pass_test.cc +++ b/tensorflow/core/graph/mkl_layout_pass_test.cc @@ -3108,6 +3108,112 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormV3_Negative) { "B->F:1;C->F:2;D->F:3;E->F:4;F->G:1"); } +#ifdef ENABLE_MKLDNN_V1 +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + InitGraph("node { name: 'A' op: '" #INPUT \ + "'}" \ + "node { name: 'B' op: 'Input'}" \ + "node { name: 'C' op: 'Input'}" \ + "node { name: 'D' op: 'Input'}" \ + "node { name: 'E' op: 'Input'}" \ + "node { name: 'F' op: '_FusedBatchNormEx'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " attr { key: 'U' value { type: DT_FLOAT } }" \ + " attr { key: 'data_format' value { s: 'NCHW' } }" \ + " attr { key: 'epsilon' value { f: 0.0001 } }" \ + " attr { key: 'num_side_inputs' value { i: 0 } }" \ + " attr { key: 'is_training' value { b: true } }" \ + " attr { key: 'activation_mode' value { s: 'Relu' } }" \ + " input: ['A', 'B', 'C', 'D', 'E'] }" \ + "node { name: 'G' op: 'Zeta'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " input: ['A', 'F'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT \ + ");B(Input);C(Input);D(Input);" \ + "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);" \ + "DMT/_4(Const);E(Input);" \ + "F(_MklFusedBatchNormEx);G(Zeta)|A->F;A->G;" \ + "A:control->DMT/_0:control;A:control->DMT/_1:control;" \ + "A:control->DMT/_2:control;A:control->DMT/_3:control;" \ + "A:control->DMT/_4:control;B->F:1;C->F:2;D->F:3;" \ + "DMT/_0->F:5;DMT/_1->F:6;DMT/_2->F:7;DMT/_3->F:8;DMT/_4->F:9;" \ + "E->F:4;F->G:1"); \ + } +REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Positive); +#undef REGISTER_TEST + +// Rewrite test for _FusedBatchNormEx Op with side input +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + InitGraph("node { name: 'A' op: '" #INPUT \ + "'}" \ + "node { name: 'B' op: 'Input'}" \ + "node { name: 'C' op: 'Input'}" \ + "node { name: 'D' op: 'Input'}" \ + "node { name: 'E' op: 'Input'}" \ + "node { name: 'F' op: '" #INPUT \ + "'}" \ + "node { name: 'G' op: '_FusedBatchNormEx'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " attr { key: 'U' value { type: DT_FLOAT } }" \ + " attr { key: 'data_format' value { s: 'NCHW' } }" \ + " attr { key: 'epsilon' value { f: 0.0001 } }" \ + " attr { key: 
'num_side_inputs' value { i: 1 } }" \ + " attr { key: 'is_training' value { b: true } }" \ + " attr { key: 'activation_mode' value { s: 'Relu' } }" \ + " input: ['A', 'B', 'C', 'D', 'E', 'F'] }" \ + "node { name: 'H' op: 'Zeta'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " input: ['A', 'G'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT \ + ");B(Input);C(Input);D(Input);E(Input);" \ + "F(" #INPUT \ + ");G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ + "B->G:1;C->G:2;D->G:3;E->G:4;F->G:5;G->H:1"); \ + } +REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); +#undef REGISTER_TEST + +// Rewrite test for _FusedBatchNormEx Op with Identity activation +#define REGISTER_TEST(NAME, T, INPUT) \ + TEST_F(MklLayoutPassTest, NAME##_##T) { \ + InitGraph("node { name: 'A' op: '" #INPUT \ + "'}" \ + "node { name: 'B' op: 'Input'}" \ + "node { name: 'C' op: 'Input'}" \ + "node { name: 'D' op: 'Input'}" \ + "node { name: 'E' op: 'Input'}" \ + "node { name: 'G' op: '_FusedBatchNormEx'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " attr { key: 'U' value { type: DT_FLOAT } }" \ + " attr { key: 'data_format' value { s: 'NCHW' } }" \ + " attr { key: 'epsilon' value { f: 0.0001 } }" \ + " attr { key: 'num_side_inputs' value { i: 1 } }" \ + " attr { key: 'is_training' value { b: true } }" \ + " attr { key: 'activation_mode' value { s: 'Identity' } }" \ + " input: ['A', 'B', 'C', 'D', 'E'] }" \ + "node { name: 'H' op: 'Zeta'" \ + " attr { key: 'T' value { type: " #T \ + " } }" \ + " input: ['A', 'G'] }"); \ + EXPECT_EQ(DoMklLayoutOptimizationPass(), \ + "A(" #INPUT \ + ");B(Input);C(Input);D(Input);E(Input);" \ + "G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ + "B->G:1;C->G:2;D->G:3;E->G:4;G->H:1"); \ + } +REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative2); +#undef REGISTER_TEST +#endif // ENABLE_MKLDNN_V1 + TEST_F(MklLayoutPassTest, NodeRewrite_QuantizedDepthwiseConv2D_Positive) { InitGraph( "node { name: 'A' op: 'QuantizedUnsignedInt8Input'}" diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc index 87841316fc1..66cc3418f3a 100644 --- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/core/grappler/grappler_item.h" #include "tensorflow/core/grappler/optimizers/remapper.h" #include "tensorflow/core/grappler/utils/grappler_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -173,6 +174,178 @@ TEST_F(MklRemapperTest, FuseConv2DWithBiasAndAddNRelu) { test::ExpectTensorNear(tensors_expected[0], tensors[0], 1e-6); } +#ifdef ENABLE_MKLDNN_V1 +TEST_F(MklRemapperTest, FuseBatchNormWithRelu) { + using ::tensorflow::ops::Placeholder; + + for (bool is_training : {true, false}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + const int num_channels = 24; + + TensorShape channel_shape({num_channels}); + TensorShape empty_shape({0}); + + auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, + ops::Placeholder::Shape({2, 8, 8, num_channels})); + auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); + auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); + auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); + auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); + auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); + + float epsilon = 0.1f; + auto fbn = ops::FusedBatchNormV3( + s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, + ops::FusedBatchNormV3::IsTraining(is_training) + .Epsilon(epsilon) + .DataFormat("NHWC")); + auto relu = ops::Relu(s.WithOpName("relu"), fbn.y); + auto fetch = ops::Identity(s.WithOpName("fetch"), relu); + + auto input_t = GenerateRandomTensor({2, 8, 8, num_channels}); + auto scale_t = GenerateRandomTensor(channel_shape); + auto offset_t = GenerateRandomTensor(channel_shape); + auto mean_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto var_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + + GrapplerItem item; + item.fetch = {"fetch"}; + item.feed = {{"input", input_t}, + {"scale", scale_t}, + {"offset", offset_t}, + {"mean", mean_t}, + {"var", var_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. 
+ for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Identity"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "_FusedBatchNormEx"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + + auto attr = node.attr(); + EXPECT_EQ(attr["num_side_inputs"].i(), 0); + EXPECT_EQ(attr["activation_mode"].s(), "Relu"); + found++; + } + } + EXPECT_EQ(found, 2); + } +} + +TEST_F(MklRemapperTest, FuseBatchNormWithAddAndRelu) { + using ::tensorflow::ops::Placeholder; + + for (bool is_training : {true, false}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + + const int num_channels = 24; + + TensorShape input_shape({2, 8, 8, num_channels}); + TensorShape channel_shape({num_channels}); + TensorShape empty_shape({0}); + + auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, + ops::Placeholder::Shape(input_shape)); + auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); + auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); + auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); + auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); + auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); + auto side_input = Placeholder(s.WithOpName("side_input"), DT_FLOAT, + ops::Placeholder::Shape(input_shape)); + auto side_input_cast = + ops::Cast(s.WithOpName("side_input_cast"), side_input, DT_FLOAT); + + float epsilon = 0.1f; + auto fbn = ops::FusedBatchNormV3( + s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, + ops::FusedBatchNormV3::IsTraining(is_training) + .Epsilon(epsilon) + .DataFormat("NHWC")); + auto add = ops::Add(s.WithOpName("add"), fbn.y, side_input_cast); + auto relu = ops::Relu(s.WithOpName("relu"), add); + auto fetch = ops::Identity(s.WithOpName("fetch"), relu); + + auto input_t = GenerateRandomTensor(input_shape); + auto scale_t = GenerateRandomTensor(channel_shape); + auto offset_t = GenerateRandomTensor(channel_shape); + auto mean_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto var_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto side_input_t = GenerateRandomTensor({2, 8, 8, num_channels}); + + GrapplerItem item; + item.fetch = {"fetch"}; + item.feed = {{"input", input_t}, {"scale", scale_t}, + {"offset", offset_t}, {"mean", mean_t}, + {"var", var_t}, {"side_input", side_input_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. 
+ for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + for (const NodeDef& node : output.node()) { + if (node.name() == "add") { + EXPECT_EQ(node.op(), "Add"); + ASSERT_EQ(node.input_size(), 2); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + EXPECT_EQ(node.input(1), "side_input_cast"); + found++; + } + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Relu"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "add"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "FusedBatchNormV3"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + found++; + } + } + EXPECT_EQ(found, 3); + } +} +#endif // ENABLE_MKLDNN_V1 + } // namespace grappler } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 5b41ad38089..fd8c7a0af12 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -741,24 +741,27 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, [&](const utils::MutableNodeView& fused_batch_norm) -> bool { const auto* fused_batch_norm_node_def = fused_batch_norm.node(); if (!IsFusedBatchNorm(*fused_batch_norm_node_def)) return false; - - // We fuse FusedBatchNorm only on GPU, because on CPU we fuse it with - // contraction (MatMul or Conv2D node). +// We fuse FusedBatchNorm on GPU or MKL CPU. +#ifndef ENABLE_MKLDNN_V1 if (!NodeIsOnGpu(fused_batch_norm_node_def)) return false; +#endif DataType t_dtype = GetDataTypeFromAttr(*fused_batch_norm_node_def, "T"); +#ifndef ENABLE_MKLDNN_V1 if (t_dtype != DT_FLOAT && t_dtype != DT_HALF) return false; +#else + if (t_dtype != DT_FLOAT && t_dtype != DT_BFLOAT16) return false; +#endif // Get the FusedBatchNorm training mode. bool is_training; if (!GetNodeAttr(*fused_batch_norm_node_def, kIsTraining, &is_training) .ok()) return false; - // In training mode we rely on cuDNN for computing FusedBatchNorm with side // inputs and activation, and it has its own limitations. In inference mode // we have a custom CUDA kernel that doesn't not have these constraints. - if (is_training) { + if (is_training && NodeIsOnGpu(fused_batch_norm_node_def)) { // cuDNN only supports NHWC data layout. string data_format; if (!GetNodeAttr(*fused_batch_norm_node_def, kDataFormat, &data_format) @@ -810,6 +813,12 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, // Input to a Relu can be an Add node with FusedBatchNorm as one of the inputs if (IsAdd(*relu_fanin_0_node_def)) { +// Currently no CPU implementation for "FusedBatchNorm + SideInput + +// "" +#ifdef ENABLE_MKLDNN_V1 + return false; +#endif + // Check that only Relu node consumes the output of an Add node. 
if (HasControlFaninOrFanout(*relu_fanin_0_node_view) || !HasAtMostOneFanoutAtPort0(*relu_fanin_0_node_view) || @@ -881,7 +890,11 @@ void CopyFusedBatchNormAttributes(const NodeDef& fused_batch_norm, if (fused_batch_norm.op() != "FusedBatchNorm") { (*attr)["U"] = src_attr.at("U"); } else { +#ifndef ENABLE_MKLDNN_V1 (*attr)["U"] = src_attr.at("T"); +#else + SetAttrValue(DT_FLOAT, &(*attr)["U"]); +#endif } } diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 0477d260e10..327ada53ec1 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -8135,7 +8135,14 @@ tf_mkl_kernel_library( tf_mkl_kernel_library( name = "mkl_fused_batch_norm_op", srcs = ["mkl_fused_batch_norm_op.cc"], - deps = NN_DEPS + mkl_deps(), + hdrs = [ + "fused_batch_norm_op.h", + "no_op.h", + ], + deps = NN_DEPS + [ + ":fused_batch_norm_op", + ":no_op", + ] + mkl_deps(), ) tf_cc_test_mkl( diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc index 40e4825c0fa..6df02bc3023 100644 --- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc @@ -14,14 +14,16 @@ limitations under the License. ==============================================================================*/ #ifdef INTEL_MKL #include "mkldnn.hpp" -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/kernels/fused_batch_norm_op.h" +#include "tensorflow/core/kernels/no_op.h" #include "tensorflow/core/util/mkl_types.h" #include "tensorflow/core/util/mkl_util.h" #include "tensorflow/core/util/tensor_format.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #define GET_FLAG(bn_flag) static_cast(BN_FLAGS::bn_flag) #define IS_SET(cflag) (context_.flags & GET_FLAG(cflag)) @@ -37,11 +39,14 @@ using BatchNormBwdPd = mkldnn::batch_normalization_backward::primitive_desc; namespace tensorflow { using CPUDevice = Eigen::ThreadPoolDevice; +using FusedBNActivationMode = functor::FusedBatchNormActivationMode; + struct MklBatchNormFwdParams { memory::dims src_dims; int depth; float eps; bool training; + FusedBNActivationMode activation_mode; #ifndef ENABLE_MKLDNN_V1 MEMORY_FORMAT src_format; #else @@ -50,14 +55,17 @@ struct MklBatchNormFwdParams { MklBatchNormFwdParams(const memory::dims& src_dims, int depth, float eps, #ifndef ENABLE_MKLDNN_V1 - bool training, MEMORY_FORMAT src_format) + bool training, MEMORY_FORMAT src_format, + FusedBNActivationMode activation_mode) #else - bool training, memory::desc src_md) + bool training, memory::desc src_md, + FusedBNActivationMode activation_mode) #endif // !ENABLE_MKLDNN_V1 : src_dims(src_dims), depth(depth), eps(eps), training(training), + activation_mode(activation_mode), #ifndef ENABLE_MKLDNN_V1 src_format(src_format) { } @@ -90,7 +98,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { // mean_data: output data buffer of means // variance_data: output data buffer of variances void Execute(const T* src_data, const U* weights_data, T* dst_data, - U* mean_data, U* variance_data) { + U* mean_data, U* variance_data, U* workspace_data) { context_.src_mem->set_data_handle( static_cast(const_cast(src_data))); context_.dst_mem->set_data_handle(static_cast(dst_data)); @@ -104,6 +112,9 @@ class MklFusedBatchNormFwdPrimitive 
: public MklPrimitive { context_.mean_mem->set_data_handle(static_cast(mean_data)); context_.variance_mem->set_data_handle(static_cast(variance_data)); } + if (workspace_data != nullptr) { + context_.ws_mem->set_data_handle(workspace_data); + } #ifdef ENABLE_MKLDNN_V1 // Execute batch-normalization forward primitives. execute_primitives(context_.fwd_primitives, context_.fwd_stream, @@ -123,6 +134,10 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { context_.mean_mem->set_data_handle(DummyData); context_.variance_mem->set_data_handle(DummyData); } + + if (workspace_data != nullptr) { + context_.ws_mem->set_data_handle(DummyData); + } } MEMORY_PRIMITIVE_DESC GetDstPd() const { return context_.dst_mem->GET_DESC; } @@ -158,6 +173,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { std::shared_ptr dst_mem; std::shared_ptr mean_mem; std::shared_ptr variance_mem; + std::shared_ptr ws_mem; // Forward BatchNorm primitive descriptor. std::shared_ptr fwd_pd; @@ -179,6 +195,7 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { dst_mem(nullptr), mean_mem(nullptr), variance_mem(nullptr), + ws_mem(nullptr), bn_fwd(nullptr), fwd_stream(nullptr) {} }; @@ -192,6 +209,9 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { : prop_kind::forward_scoring; #ifdef ENABLE_MKLDNN_V1 + if (fwdParams.activation_mode == FusedBNActivationMode::kRelu) { + context_.flags |= GET_FLAG(fuse_norm_relu); + } // Memory descriptor auto src_md = fwdParams.src_md; // Create forward BatchNorm descriptor and primitive descriptor. @@ -229,6 +249,13 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { m_dims, U, MEMORY_FORMAT::nc, cpu_engine_, DummyData)); } +#ifdef ENABLE_MKLDNN_V1 + if (IS_SET(fuse_norm_relu)) { + context_.ws_mem.reset(new MEMORY_CONSTRUCTOR( + context_.fwd_pd->workspace_desc(), cpu_engine_, DummyData)); + } +#endif // ENABLE_MKLDNN_V1 + // BatchNorm forward primitive. 
// TODO(intel-tf): Merge all the #ifdefs and simplify code if (!fwdParams.training && !(IS_SET(use_global_stats))) { @@ -258,20 +285,41 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { } else if (IS_SET(use_global_stats)) { #ifdef ENABLE_MKLDNN_V1 if ((IS_SET(use_scale_shift)) && GET_FLAG(use_scale_shift)) { - context_.net_args.push_back( - {{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, - {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, - { MKLDNN_ARG_DST, - *context_.dst_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + { MKLDNN_ARG_DST, + *context_.dst_mem }}); + } } else { - context_.net_args.push_back( - {{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, - { MKLDNN_ARG_DST, - *context_.dst_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + { MKLDNN_ARG_DST, + *context_.dst_mem }}); + } } context_.bn_fwd.reset(new batch_normalization_forward(*context_.fwd_pd)); #else @@ -291,19 +339,40 @@ class MklFusedBatchNormFwdPrimitive : public MklPrimitive { } else { #ifdef ENABLE_MKLDNN_V1 if ((IS_SET(use_scale_shift)) && GET_FLAG(use_scale_shift)) { - context_.net_args.push_back( - {{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, - {MKLDNN_ARG_DST, *context_.dst_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - { MKLDNN_ARG_VARIANCE, - *context_.variance_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_WEIGHTS, *context_.weights_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + { MKLDNN_ARG_VARIANCE, + *context_.variance_mem }}); + } } else { - context_.net_args.push_back({{MKLDNN_ARG_SRC, *context_.src_mem}, - {MKLDNN_ARG_DST, *context_.dst_mem}, - {MKLDNN_ARG_MEAN, *context_.mean_mem}, - { MKLDNN_ARG_VARIANCE, - *context_.variance_mem }}); + if (IS_SET(fuse_norm_relu)) { + context_.net_args.push_back( + {{MKLDNN_ARG_SRC, *context_.src_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + {MKLDNN_ARG_VARIANCE, *context_.variance_mem}, + { MKLDNN_ARG_WORKSPACE, + *context_.ws_mem }}); + } else { + context_.net_args.push_back({{MKLDNN_ARG_SRC, 
*context_.src_mem}, + {MKLDNN_ARG_DST, *context_.dst_mem}, + {MKLDNN_ARG_MEAN, *context_.mean_mem}, + { MKLDNN_ARG_VARIANCE, + *context_.variance_mem }}); + } } context_.bn_fwd.reset(new batch_normalization_forward(*context_.fwd_pd)); #else @@ -360,6 +429,7 @@ class MklFusedBatchNormFwdPrimitiveFactory : public MklPrimitiveFactory { key_creator.AddAsKey(fwdParams.depth); key_creator.AddAsKey(fwdParams.eps); key_creator.AddAsKey(fwdParams.training); + key_creator.AddAsKey(fwdParams.activation_mode); key_creator.AddAsKey(typeid(T).name()); key_creator.AddAsKey(typeid(U).name()); return key_creator.GetKey(); @@ -676,7 +746,8 @@ class MklFusedBatchNormBwdPrimitiveFactory : public MklPrimitiveFactory { // Adding a third parameter to the template to support FusedBatchNormV3 // with MKL. This is different from default where the classes are // derived. Moves enabling to compile-time rather than runtime. -template +template class MklFusedBatchNormOp : public OpKernel { public: explicit MklFusedBatchNormOp(OpKernelConstruction* context) @@ -696,6 +767,28 @@ class MklFusedBatchNormOp : public OpKernel { depth_ = 0; mean_values_ = nullptr; variance_values_ = nullptr; + +#ifndef ENABLE_MKLDNN_V1 + OP_REQUIRES(context, !is_batch_norm_ex, + errors::InvalidArgument( + "_MklFusedBatchNormEx is not supported in DNNL 0.x .")); +#endif + if (!is_batch_norm_ex) { + activation_mode_ = FusedBNActivationMode::kIdentity; + } else { + int num_side_inputs; + OP_REQUIRES_OK(context, + context->GetAttr("num_side_inputs", &num_side_inputs)); + // Currently _MKLFusedBatchNormEx do not support "SideInput" + OP_REQUIRES(context, num_side_inputs == 0, + errors::InvalidArgument( + "_MKLFusedBatchNorm do not support side input now.")); + + OP_REQUIRES_OK(context, ParseActivationMode(context, &activation_mode_)); + OP_REQUIRES(context, activation_mode_ == FusedBNActivationMode::kRelu, + errors::InvalidArgument( + "_MKLFusedBatchNorm only support Relu activation")); + } } void Compute(OpKernelContext* context) override { @@ -744,9 +837,12 @@ class MklFusedBatchNormOp : public OpKernel { // Handle the special case: input with 0 element and 0 batch size. Tensor* dst_tensor = nullptr; + TensorShape workspace_tf_shape; if (tf_shape_src.num_elements() == 0) { - HandleEmptyInput(context, tf_shape_src, scale_tensor.shape(), - &dst_tensor); + size_t workspace_bytes = 0; + workspace_tf_shape.AddDim(workspace_bytes); + HandleEmptyInput(context, tf_shape_src, workspace_tf_shape, + scale_tensor.shape(), &dst_tensor); return; } @@ -758,23 +854,16 @@ class MklFusedBatchNormOp : public OpKernel { // Index of output tensor(diff_src). const size_t kDstIndex = 0; - // Allocate 4 output TF tensors. + // Allocate 5 output TF tensors. Tensor* batch_mean_tensor = nullptr; Tensor* batch_variance_tensor = nullptr; Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; Tensor* reserved_space_tensor = nullptr; - AllocateTFOutputs(context, scale_tensor.shape(), &batch_mean_tensor, - &batch_variance_tensor, &saved_mean_tensor, - &saved_variance_tensor, &reserved_space_tensor); - - if (is_training_) - SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor); - else - SetMeanVariance(est_mean_tensor, est_variance_tensor); MklDnnData src(&cpu_engine_); MklDnnData weights(&cpu_engine_); + MklDnnData wksp(&cpu_engine_); MEMORY_FORMAT dnn_fmt; MKL_TENSOR_FORMAT mkl_tensor_fmt; @@ -801,6 +890,51 @@ class MklFusedBatchNormOp : public OpKernel { ? 
dnn_shape_src.GetMklLayout() : memory::desc(src_dims, MklDnnType(), dnn_fmt); +#ifdef ENABLE_MKLDNN_V1 + MklBatchNormFwdParams fwdParams(src_dims, depth_, epsilon_, is_training_, + src_md, activation_mode_); +#else + MklBatchNormFwdParams fwdParams( + src_dims, depth_, epsilon_, is_training_, + static_cast(src_md.data.format), activation_mode_); +#endif // ENABLE_MKLDNN_V1 + // Get forward batch-normalization op from the primitive caching pool. + MklFusedBatchNormFwdPrimitive* bn_fwd = + MklFusedBatchNormFwdPrimitiveFactory::Get(fwdParams); + + // Allocate workspace tensor + U* ws_data = nullptr; + if (fwdParams.activation_mode == FusedBNActivationMode::kRelu) { +#ifdef ENABLE_MKLDNN_V1 + MEMORY_PRIMITIVE_DESC workspace_pd = + bn_fwd->GetBatchNormFwdPd()->workspace_desc(); + size_t workspace_bytes = workspace_pd.get_size(); + workspace_tf_shape.AddDim(workspace_bytes); + + AllocateTFOutputs(context, scale_tensor.shape(), workspace_tf_shape, + &batch_mean_tensor, &batch_variance_tensor, + &saved_mean_tensor, &saved_variance_tensor, + &reserved_space_tensor); + if (reserved_space) { + wksp.SetUsrMem(workspace_pd, reserved_space_tensor); + ws_data = static_cast(wksp.GetOpMem().get_data_handle()); + } +#endif // ENABLE_MKLDNN_V1 + } else { + // There is actually no workspace tensor out, so we make a dummy one. + size_t workspace_bytes = 0; + workspace_tf_shape.AddDim(workspace_bytes); + AllocateTFOutputs(context, scale_tensor.shape(), workspace_tf_shape, + &batch_mean_tensor, &batch_variance_tensor, + &saved_mean_tensor, &saved_variance_tensor, + &reserved_space_tensor); + } + + if (is_training_) + SetMeanVariance(*batch_mean_tensor, *batch_variance_tensor); + else + SetMeanVariance(est_mean_tensor, est_variance_tensor); + // MKL-DNN packs scale & shift as "weights": // ...... weights.AllocateBuffer(2 * depth_ * sizeof(U)); @@ -821,18 +955,6 @@ class MklFusedBatchNormOp : public OpKernel { reinterpret_cast(variance_values_), depth_ * sizeof(U)); -#ifdef ENABLE_MKLDNN_V1 - MklBatchNormFwdParams fwdParams(src_dims, depth_, epsilon_, is_training_, - src_md); -#else - MklBatchNormFwdParams fwdParams( - src_dims, depth_, epsilon_, is_training_, - static_cast(src_md.data.format)); -#endif // ENABLE_MKLDNN_V1 - // Get forward batch-normalization op from the primitive caching pool. - MklFusedBatchNormFwdPrimitive* bn_fwd = - MklFusedBatchNormFwdPrimitiveFactory::Get(fwdParams); - // Check if reorder is needed for src. const T* src_data = nullptr; std::shared_ptr bn_fwd_pd = bn_fwd->GetBatchNormFwdPd(); @@ -866,7 +988,7 @@ class MklFusedBatchNormOp : public OpKernel { // Execute bn_fwd->Execute(src_data, weights_op_data, dst_data, mean_op_data, - variance_op_data); + variance_op_data, ws_data); float adjust_factor = 1.0; if (is_training_) { @@ -924,6 +1046,7 @@ class MklFusedBatchNormOp : public OpKernel { U* mean_values_; U* variance_values_; size_t depth_; // Batch normalization is performed for per channel. 
+ FusedBNActivationMode activation_mode_; engine cpu_engine_ = engine(ENGINE_CPU, 0); void ExtractParams(OpKernelContext* context) { @@ -938,6 +1061,7 @@ class MklFusedBatchNormOp : public OpKernel { } void HandleEmptyInput(OpKernelContext* context, TensorShape tf_shape_src, + TensorShape workspace_tf_shape, TensorShape tf_shape_scale, Tensor** dst_tensor) { DCHECK(dst_tensor); @@ -955,12 +1079,14 @@ class MklFusedBatchNormOp : public OpKernel { Tensor* saved_mean_tensor = nullptr; Tensor* saved_variance_tensor = nullptr; Tensor* reserved_space_tensor = nullptr; - AllocateTFOutputs(context, tf_shape_scale, &batch_mean_tensor, - &batch_variance_tensor, &saved_mean_tensor, - &saved_variance_tensor, &reserved_space_tensor); + AllocateTFOutputs(context, tf_shape_scale, workspace_tf_shape, + &batch_mean_tensor, &batch_variance_tensor, + &saved_mean_tensor, &saved_variance_tensor, + &reserved_space_tensor); } void AllocateTFOutputs(OpKernelContext* context, TensorShape tf_shape_scale, + TensorShape workspace_tf_shape, Tensor** batch_mean_tensor, Tensor** batch_variance_tensor, Tensor** saved_mean_tensor, @@ -1024,21 +1150,15 @@ class MklFusedBatchNormOp : public OpKernel { std::fill_n(saved_variance_data, num_elements, static_cast(0)); // Changes to support reserved_space_3 parameter in FusedBatchNormV3. - // TODO: This parameter functionality is not implemented on CPU. - // It is used to hold intermediate results. So the allocated - // memory is filled with 0s. if (reserved_space) { DCHECK(reserved_space_tensor != nullptr); MklDnnShape mkl_shape_reserved_space; mkl_shape_reserved_space.SetMklTensor(false); AllocateOutputSetMklShape(context, kReservedSpaceIndex, - reserved_space_tensor, tf_shape_scale, + reserved_space_tensor, workspace_tf_shape, mkl_shape_reserved_space); DCHECK((*reserved_space_tensor) != nullptr); - auto saved_reserved_space_data = - (*reserved_space_tensor)->flat().data(); - std::fill_n(saved_reserved_space_data, num_elements, static_cast(0)); } } }; @@ -1367,7 +1487,7 @@ class MklFusedBatchNormGradOp : public OpKernel { .Device(DEVICE_CPU) \ .TypeConstraint("T") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklFusedBatchNormOp); + MklFusedBatchNormOp); TF_CALL_float(REGISTER_MKL_FUSED_BATCHNORM_CPU); TF_CALL_bfloat16(REGISTER_MKL_FUSED_BATCHNORM_CPU); @@ -1380,7 +1500,7 @@ TF_CALL_bfloat16(REGISTER_MKL_FUSED_BATCHNORM_CPU); .TypeConstraint("T") \ .TypeConstraint("U") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklFusedBatchNormOp); + MklFusedBatchNormOp); REGISTER_MKL_FUSED_BATCHNORM_V2_CPU(float, float); REGISTER_MKL_FUSED_BATCHNORM_V2_CPU(bfloat16, float); @@ -1421,12 +1541,30 @@ REGISTER_MKL_FUSED_BATCHNORM_GRAD_V2_CPU(bfloat16, float); .TypeConstraint("T") \ .TypeConstraint("U") \ .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklFusedBatchNormOp); + MklFusedBatchNormOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklFusedBatchNormEx") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("U") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklFusedBatchNormOp); REGISTER_MKL_FUSED_BATCHNORM_V3_CPU(float, float); REGISTER_MKL_FUSED_BATCHNORM_V3_CPU(bfloat16, float); #undef REGISTER_MKL_FUSED_BATCHNORM_V3_CPU +REGISTER_KERNEL_BUILDER(Name("_FusedBatchNormEx") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("U"), + NoOp); +REGISTER_KERNEL_BUILDER(Name("_FusedBatchNormEx") + .Device(DEVICE_CPU) + .TypeConstraint("T") + .TypeConstraint("U"), + NoOp); + #define 
REGISTER_MKL_FUSED_BATCHNORM_GRAD_V3_CPU(T, U) \ REGISTER_KERNEL_BUILDER( \ Name("_MklFusedBatchNormGradV3") \ diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc index 3f9cc0df131..90f945d2692 100644 --- a/tensorflow/core/ops/mkl_nn_ops.cc +++ b/tensorflow/core/ops/mkl_nn_ops.cc @@ -1342,6 +1342,48 @@ REGISTER_OP("_MklFusedBatchNormGradV3") R"doc(MKL-DNN implementation of FusedBatchNormGradV3: Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke this operator.)doc"); +REGISTER_OP("_MklFusedBatchNormEx") + .Input("x: T") + .Input("scale: U") + .Input("offset: U") + .Input("mean: U") + .Input("variance: U") + .Input("side_input: num_side_inputs * T") + .Input("mkl_x: uint8") + .Input("mkl_scale: uint8") + .Input("mkl_offset: uint8") + .Input("mkl_mean: uint8") + .Input("mkl_variance: uint8") + .Input("mkl_side_input: num_side_inputs * uint8") + .Output("y: T") + .Output("batch_mean: U") + .Output("batch_variance: U") + .Output("reserve_space_1: U") + .Output("reserve_space_2: U") + .Output("reserve_space_3: U") + .Output("mkl_y: uint8") + .Output("mkl_batch_mean: uint8") + .Output("mkl_batch_variance: uint8") + .Output("mkl_reserve_space_1: uint8") + .Output("mkl_reserve_space_2: uint8") + .Output("mkl_reserve_space_3: uint8") + .Attr("T: {bfloat16, float}") + .Attr("U: {float}") + .Attr("epsilon: float = 0.0001") + .Attr("exponential_avg_factor: float = 1.0") + .Attr(GetConvnetDataFormatAttrString()) + .Attr("num_side_inputs: int >= 0 = 0") + .Attr("activation_mode: string = \"Identity\"") + .Attr("is_training: bool = true") + .SetShapeFn(shape_inference::FusedBatchNormShape) + .Doc(R"doc( +MKL version of FusedBatchNormEx operator. Uses MKL DNN APIs to perform fused +batch normalization and relu. + +NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + } // namespace tensorflow #endif // INTEL_MKL diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 2e55bc6cd95..53ee2cfa035 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -238,7 +238,11 @@ REGISTER_OP("_FusedBatchNormEx") .Output("reserve_space_1: U") .Output("reserve_space_2: U") .Output("reserve_space_3: U") +#ifdef ENABLE_MKLDNN_V1 + .Attr("T: {half, float, bfloat16}") +#else .Attr("T: {half, float}") +#endif .Attr("U: {float}") .Attr("epsilon: float = 0.0001") .Attr("exponential_avg_factor: float = 1.0") From 08cd923c406a20aa019344e081a74f03ad2ebd0f Mon Sep 17 00:00:00 2001 From: ShengYang1 Date: Wed, 8 Apr 2020 14:07:24 +0800 Subject: [PATCH 002/412] Copy exponential_avg_factor attr in grappler --- tensorflow/core/grappler/optimizers/remapper.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index fd8c7a0af12..6f94f66cb00 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -885,6 +885,7 @@ void CopyFusedBatchNormAttributes(const NodeDef& fused_batch_norm, (*attr)["is_training"] = src_attr.at("is_training"); (*attr)["data_format"] = src_attr.at("data_format"); (*attr)["epsilon"] = src_attr.at("epsilon"); + (*attr)["exponential_avg_factor"] = src_attr.at("exponential_avg_factor"); // FusedBatchNormV2 and V3 have an extra type parameter. 
if (fused_batch_norm.op() != "FusedBatchNorm") { From 966ed1cafc770e81e6a56be3f5715e0fe257b742 Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Thu, 16 Apr 2020 18:41:20 +0800 Subject: [PATCH 003/412] Use provided host name/ip instead of localhost if possible --- .../distributed_runtime/rpc/grpc_server_lib.cc | 15 +++++++++++---- .../distributed_runtime/rpc/grpc_server_lib.h | 5 ++++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 32083fc272f..7e2c42dabea 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -132,8 +132,9 @@ GrpcServer::~GrpcServer() { void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {} // Look up the port that has been requested for this task in `server_def`. -Status GrpcServer::GetPort(const ServerDef& server_def, int* port) const { +Status GrpcServer::GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const { *port = -1; + *host_name = "localhost"; for (const auto& job : server_def.cluster().job()) { if (job.name() == server_def.job_name()) { auto iter = job.tasks().find(server_def.task_index()); @@ -153,6 +154,10 @@ Status GrpcServer::GetPort(const ServerDef& server_def, int* port) const { "Could not parse port for local server from \"", iter->second, "\"."); } + + if (colon_index != string::npos && !iter->second.substr(0, colon_index).empty()) { + *host_name = iter->second.substr(0, colon_index); + } } break; } @@ -175,7 +180,9 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { // otherwise if 'task_index=-1' the program will abort. int requested_port; - TF_RETURN_IF_ERROR(GetPort(server_def_, &requested_port)); + string host_name; + TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name, &requested_port)); + host_name_ = host_name; SessionOptions sess_opts; ConfigProto config = server_def_.default_session_config(); @@ -325,7 +332,7 @@ Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options, task.second); } if (job.name() == *options.job_name && task.first == options.task_index) { - host_port = strings::StrCat("localhost:", bound_port_); + host_port = strings::StrCat(host_name_, ":", bound_port_); } else { host_port = task.second; } @@ -478,7 +485,7 @@ Status GrpcServer::Join() { } const string GrpcServer::target() const { - return strings::StrCat("grpc://localhost:", bound_port_); + return strings::StrCat("grpc://", host_name_, ":", bound_port_); } std::shared_ptr<::grpc::ServerCredentials> GrpcServer::GetServerCredentials( diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index 8e25b8835eb..feb174cde4e 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -104,7 +104,7 @@ class GrpcServer : public ServerInterface { Status UpdateServerDef(const ServerDef& server_def); protected: - virtual Status GetPort(const ServerDef& server_def, int* port) const; + virtual Status GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const; Status Init(const GrpcServerOptions& opts = GrpcServerOptions()); // A subclass can override this method to support secure credentials. @@ -136,6 +136,9 @@ class GrpcServer : public ServerInterface { // The port to which this server is bound. 
int bound_port_ = 0; + // The host name of this server + string host_name_; + // Guards server configuration, server, and state. mutex mu_; From 3a8b6ba5c1c8c2111c53490eba3f0c1a07f2494a Mon Sep 17 00:00:00 2001 From: Fei Sun Date: Wed, 29 Apr 2020 10:35:01 +0800 Subject: [PATCH 004/412] Edit according to PR comments --- .../core/distributed_runtime/rpc/grpc_server_lib.cc | 8 ++++---- tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h | 4 +++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc index 7e2c42dabea..2cfdde5f56f 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc @@ -132,7 +132,9 @@ GrpcServer::~GrpcServer() { void GrpcServer::MaybeMutateBuilder(::grpc::ServerBuilder* builder) {} // Look up the port that has been requested for this task in `server_def`. -Status GrpcServer::GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const { +Status GrpcServer::GetHostAndPort(const ServerDef& server_def, + string* host_name, + int* port) const { *port = -1; *host_name = "localhost"; for (const auto& job : server_def.cluster().job()) { @@ -180,9 +182,7 @@ Status GrpcServer::Init(const GrpcServerOptions& opts) { // otherwise if 'task_index=-1' the program will abort. int requested_port; - string host_name; - TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name, &requested_port)); - host_name_ = host_name; + TF_RETURN_IF_ERROR(GetHostAndPort(server_def_, &host_name_, &requested_port)); SessionOptions sess_opts; ConfigProto config = server_def_.default_session_config(); diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h index feb174cde4e..8ecf0e158bf 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h @@ -104,7 +104,9 @@ class GrpcServer : public ServerInterface { Status UpdateServerDef(const ServerDef& server_def); protected: - virtual Status GetHostAndPort(const ServerDef& server_def, string* host_name, int* port) const; + virtual Status GetHostAndPort(const ServerDef& server_def, + string* host_name, + int* port) const; Status Init(const GrpcServerOptions& opts = GrpcServerOptions()); // A subclass can override this method to support secure credentials. 
From 87fc8379200e1faf536d74281c062820e33be75e Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Sat, 2 May 2020 14:50:25 -0400 Subject: [PATCH 005/412] [Lite] data: Fix memory leak --- .../lite/python/interpreter_wrapper/interpreter_wrapper.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc index bd78d56172e..313de20595d 100644 --- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -592,6 +592,7 @@ PyObject* InterpreterWrapper::GetTensor(int i) const { size_t size_of_type; if (GetSizeOfType(nullptr, tensor->type, &size_of_type) != kTfLiteOk) { PyErr_SetString(PyExc_ValueError, "Unknown tensor type."); + free(data) return nullptr; } sparse_buffer_dims[0] = tensor->bytes / size_of_type; From eeafd8091221d332649cd4df50f679e3b406f88c Mon Sep 17 00:00:00 2001 From: Hahn Anselm Date: Sun, 3 May 2020 20:15:11 +0200 Subject: [PATCH 006/412] Removing unreachable return --- tensorflow/lite/python/op_hint.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index 29683718016..c7f49bdf4b6 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -1168,7 +1168,6 @@ def _get_correct_mapping(original_index, nodes): return node_indices[-1] else: return original_index - return original_index def _convert_op_hints_to_stubs_helper( From 9cfeae817a1af13c53fec4e524c905132ce23c55 Mon Sep 17 00:00:00 2001 From: Hahn Anselm Date: Mon, 4 May 2020 19:35:41 +0200 Subject: [PATCH 007/412] Rearange the return --- tensorflow/lite/python/op_hint.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index c7f49bdf4b6..159fcaa2bf3 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -1166,8 +1166,7 @@ def _get_correct_mapping(original_index, nodes): node_indices = nodes.keys() node_indices = sorted(node_indices) return node_indices[-1] - else: - return original_index + return original_index def _convert_op_hints_to_stubs_helper( From 0916c404a081488945ea28d12de129e63829f5d1 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Mon, 4 May 2020 23:58:45 +0100 Subject: [PATCH 008/412] Mark QuantizationSpec methods that don't change member data as constant --- .../compiler/mlir/lite/quantization/quantization_config.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h index 5b1c73e7887..ea59f49f5b7 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h @@ -84,7 +84,7 @@ struct QuantizationSpecs { bool RunWeightQuantization() const { return weight_quantization; } // Whether this inference type represents a signed storage type. - bool IsSignedInferenceType() { + bool IsSignedInferenceType() const { switch (inference_type) { case tensorflow::DT_QUINT8: case tensorflow::DT_QUINT16: @@ -96,7 +96,7 @@ struct QuantizationSpecs { // Gets the width of this quantization type. Returns 0 if it isn't a // quantization type. 
- int64_t GetQuantizationTypeWidth() { + int64_t GetQuantizationTypeWidth() const { switch (inference_type) { case tensorflow::DT_QINT8: case tensorflow::DT_QUINT8: From a5ae7f124b27f97a35de9b3778f98e2bcc62a10f Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 5 May 2020 00:03:05 +0100 Subject: [PATCH 009/412] Respect inference type in DefaultQuantParamsPass --- tensorflow/compiler/mlir/lite/tf_tfl_passes.cc | 3 ++- .../lite/transforms/default_quant_params.cc | 18 ++++++++++++------ .../compiler/mlir/lite/transforms/passes.h | 2 +- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 5eefa821c6b..8dbc84d1a9f 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -48,7 +48,8 @@ void AddQuantizationPasses(const mlir::TFL::QuantizationSpecs& quant_specs, quant_specs.default_ranges.second.hasValue()) { pass_manager->addPass(mlir::TFL::CreateDefaultQuantParamsPass( quant_specs.default_ranges.first.getValueOr(0.0), - quant_specs.default_ranges.second.getValueOr(0.0))); + quant_specs.default_ranges.second.getValueOr(0.0), + quant_specs.IsSignedInferenceType())); pass_manager->addPass(mlir::TFL::CreateQuantizePass()); pass_manager->addPass( mlir::TFL::CreatePostQuantizePass(emit_quant_adaptor_ops)); diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index a1602baced5..c23ae9fcfab 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -46,8 +46,11 @@ namespace { class DefaultQuantParamsPass : public PassWrapper { public: - explicit DefaultQuantParamsPass(double default_min, double default_max) - : default_min_(default_min), default_max_(default_max) {} + explicit DefaultQuantParamsPass(double default_min, double default_max, + bool is_signed) + : default_min_(default_min), + default_max_(default_max), + is_signed_(is_signed) {} void runOnFunction() override; @@ -82,6 +85,7 @@ class DefaultQuantParamsPass double default_min_; double default_max_; + bool is_signed_; quant::QuantParams default_quant_params_; }; } // namespace @@ -214,15 +218,16 @@ quant::QuantParams DefaultQuantParamsPass::GetDefaultQuantParams( default_quant_params_ = quant::fakeQuantAttrsToType( builder.getUnknownLoc(), /*numBits=*/8, default_min_, default_max_, /*narrowRange=*/false, - builder.getF32Type()); + builder.getF32Type(), is_signed_); } return default_quant_params_; } // Creates an instance of the default quant parameters pass. 
std::unique_ptr> CreateDefaultQuantParamsPass( - double default_min, double default_max) { - return absl::make_unique(default_min, default_max); + double default_min, double default_max, bool is_signed) { + return absl::make_unique(default_min, default_max, + is_signed); } // Registers this pass with default values, only for test @@ -230,7 +235,8 @@ static PassRegistration pass( "tfl-default-quant", "Apply quantization with default quantization parameter", [] { return CreateDefaultQuantParamsPass(/*default_min=*/-1.0, - /*default_max=*/1.0); + /*default_max=*/1.0, + /*is_signed=*/false); }); } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.h b/tensorflow/compiler/mlir/lite/transforms/passes.h index 959c17e317a..105c9394fb4 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/transforms/passes.h @@ -76,7 +76,7 @@ std::unique_ptr> CreateOptimizeFunctionalOpsPass(); // Creates an instance of the TensorFlow Lite dialect pass to add default // quantization parameters. std::unique_ptr> CreateDefaultQuantParamsPass( - double default_min, double default_max); + double default_min, double default_max, bool is_signed); // Creates an instance of the TensorFlow Lite dialect pass to convert dense // tensor to sparse format. From c7828e73f7f6e7b2b0e43d9b04800147615e25a0 Mon Sep 17 00:00:00 2001 From: Gaurav Singh Date: Wed, 6 May 2020 09:25:09 -0400 Subject: [PATCH 010/412] Fix build errors --- .../lite/python/interpreter_wrapper/interpreter_wrapper.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc index 313de20595d..a414e26adb0 100644 --- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc +++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc @@ -592,7 +592,7 @@ PyObject* InterpreterWrapper::GetTensor(int i) const { size_t size_of_type; if (GetSizeOfType(nullptr, tensor->type, &size_of_type) != kTfLiteOk) { PyErr_SetString(PyExc_ValueError, "Unknown tensor type."); - free(data) + free(data); return nullptr; } sparse_buffer_dims[0] = tensor->bytes / size_of_type; From fc58d58923534e461d735a9a8b460d2dc8691ae5 Mon Sep 17 00:00:00 2001 From: Bas Aarts Date: Thu, 7 May 2020 12:15:58 -0700 Subject: [PATCH 011/412] add __cudaInitModule to cudart_stub.cc this fixes https://github.com/tensorflow/tensorflow/issues/39280 --- tensorflow/stream_executor/cuda/cudart_stub.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc index 5ee106a65fd..1d7a4e378ba 100644 --- a/tensorflow/stream_executor/cuda/cudart_stub.cc +++ b/tensorflow/stream_executor/cuda/cudart_stub.cc @@ -131,6 +131,13 @@ extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( return func_ptr(gridDim, blockDim, sharedMem, stream); } +extern char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { + using FuncPtr = cudaError_t(CUDARTAPI *)(void **fatCubinHandle); + static auto func_ptr = LoadSymbol("__cudaInitModule"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(fatCubinHandle); +} + #if CUDART_VERSION >= 10010 extern void CUDARTAPI __cudaRegisterFatBinaryEnd(void **fatCubinHandle) { using FuncPtr = void(CUDARTAPI *)(void **fatCubinHandle); From 2d529fbf9de3678ec85bdaebf6ff321b49288522 Mon Sep 17 00:00:00 2001 From: Andrew Cavanaugh Date: Thu, 7 May 
2020 16:31:22 -0400 Subject: [PATCH 012/412] Prcandidate1 (#2) * Initial commit of XCORE port --- .../lite/micro/testing/test_xcore_binary.sh | 47 +++++++++++++++++++ .../micro/tools/make/download_and_extract.sh | 3 ++ .../tools/make/targets/xcore_makefile.inc | 24 ++++++++++ tensorflow/lite/micro/xcore/README.md | 32 +++++++++++++ tensorflow/lite/micro/xcore/debug_log.cc | 17 +++++++ 5 files changed, 123 insertions(+) create mode 100755 tensorflow/lite/micro/testing/test_xcore_binary.sh create mode 100644 tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc create mode 100644 tensorflow/lite/micro/xcore/README.md create mode 100644 tensorflow/lite/micro/xcore/debug_log.cc diff --git a/tensorflow/lite/micro/testing/test_xcore_binary.sh b/tensorflow/lite/micro/testing/test_xcore_binary.sh new file mode 100755 index 00000000000..e059968c885 --- /dev/null +++ b/tensorflow/lite/micro/testing/test_xcore_binary.sh @@ -0,0 +1,47 @@ +#!/bin/bash -e +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Tests an XS3 binary by executing it using the XSIM simulator and parsing +# the log output. +# +# First argument is the binary location. +# Second argument is a regular expression that's required to be in the output +# logs for the test to pass. + +declare -r ROOT_DIR=`pwd` +declare -r TEST_TMPDIR=/tmp/test_xcore_binary/ +declare -r MICRO_LOG_PATH=${TEST_TMPDIR}/$1 +declare -r MICRO_LOG_FILENAME=${MICRO_LOG_PATH}/logs.txt +declare -r XCORE_ +mkdir -p ${MICRO_LOG_PATH} + +# Get the location of this script file as an absolute path +SCRIPT_PATH="`dirname \"$BASH_SOURCE\"`" +SCRIPT_PATH="`( cd \"$SCRIPT_PATH\" && pwd )`" +XSIM_FLAGS="" + + +xsim $1 ${XSIM_FLAGS} 2>&1 | tee ${MICRO_LOG_FILENAME} + +if grep -q "$2" ${MICRO_LOG_FILENAME} +then + echo "$1: PASS" + exit 0 +else + echo "$1: FAIL - '$2' not found in logs." + exit 1 +fi + diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 2248031f6d1..5a673985cdd 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -137,6 +137,9 @@ download_and_extract() { exit 1 fi + # delete anything after the '?' in a url that might confound f + url=$(echo "${url}" | sed "s/\?.*//") + if [[ "${url}" == *gz ]]; then tar -C "${dir}" --strip-components=1 -xzf ${tempfile} elif [[ "${url}" == *tar.xz ]]; then diff --git a/tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc b/tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc new file mode 100644 index 00000000000..9a0f7463688 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/xcore_makefile.inc @@ -0,0 +1,24 @@ +# Settings for XMOS XS3 based processors (xcore.ai, ...) 
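
Editor's note: the xcore.ai toolchain URL used by this target ends in a `?key=...` query string, which is why `download_and_extract.sh` now strips everything from the first `?` before matching on the file extension. A rough Python equivalent of that logic, for illustration only (the real script uses `sed` and handles more archive formats than shown here; the key value below is made up):

```
# Illustrative Python equivalent of the `sed "s/\?.*//"` line: drop the query
# string so the archive type can be inferred from the real file extension.
from urllib.parse import urlsplit, urlunsplit

def strip_query(url):
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, "", ""))

def archive_kind(url):
    cleaned = strip_query(url)
    if cleaned.endswith("gz"):        # mirrors the *gz case in the script
        return "gzip"
    if cleaned.endswith("tar.xz"):
        return "xz"
    return "unknown"

url = "https://www.xmos.com/download/Tools-15.tgz?key=1234"  # key is a placeholder
print(archive_kind(url))  # -> "gzip"; without stripping it would be "unknown"
```
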
+ +#IMPORTANT: to set up environment variables correctly run the following from the top tensorflow directory: +# $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" clean clean_downloads test +# $ pushd tensorflow/lite/micro/tools/make/downloads/xtimecomposer/xTIMEcomposer/15.0.0/ && source SetEnv && popd +# $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + +ifeq ($(TARGET), xcore) + XTIME_URL := "https://www.xmos.com/download/Tools-15---Linux-64%2815.0.0_rc1%29.tgz?key=132D-9DC9-E913-0229-ECE6-D5AB-F511-2B19" + XTIME_MD5 := "8f6543c8ac4af7583edf75e62df322a2" + $(eval $(call add_third_party_download,$(XTIME_URL),$(XTIME_MD5),xtimecomposer)) + PLATFORM_FLAGS = -target=XU316-1024-FB265-C32 -mcmodel=large -Os -DXCORE -Wno-xcore-fptrgroup -report + CXX_TOOL := xcc + CC_TOOL := xcc + AR_TOOL := xmosar + override CXXFLAGS := -std=c++11 -g -DTF_LITE_STATIC_MEMORY -DNDEBUG + override CXXFLAGS += $(PLATFORM_FLAGS) + override CCFLAGS := -g -DTF_LITE_STATIC_MEMORY -DNDEBUG + override CCFLAGS += $(PLATFORM_FLAGS) + TARGET_ARCH := xcore + #TARGET_TOOLCHAIN_PREFIX := tensorflow/lite/micro/tools/make/downloads/xtimecomposer/bin/ + TEST_SCRIPT := tensorflow/lite/micro/testing/test_xcore_binary.sh + #GCC_XCORE := $(MAKEFILE_DIR)/downloads/xtimecomposer/bin/ +endif diff --git a/tensorflow/lite/micro/xcore/README.md b/tensorflow/lite/micro/xcore/README.md new file mode 100644 index 00000000000..bc217dce2fd --- /dev/null +++ b/tensorflow/lite/micro/xcore/README.md @@ -0,0 +1,32 @@ +# Quickstart to install tools and run unit tests: + + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" clean clean_downloads && make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test_greedy_memory_planner_test || true && pushd tensorflow/lite/micro/tools/make/downloads/xtimecomposer/xTIMEcomposer/15.0.0/ && source SetEnv && popd && make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + +(add -jN to the final make command to run builds / tests in N parallel threads) + +# Background information: + +* To start from a fresh repo (this will also remove non-xcore builds and downloads): +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" clean clean_downloads +``` +* To force xcore.ai tools download from a clean repo: +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test_greedy_memory_planner_test +``` +(this will fail to build the test, but if it succeeds because you already have tools it will exit quickly) + +* To set up environment variables correctly run the following from the top tensorflow directory: +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + $ pushd ./tensorflow/lite/micro/tools/make/downloads/xtimecomposer/xTIMEcomposer/15.0.0/ && source SetEnv && popd + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test +``` +* Assuming tools are already set up the following are the most commonly used commands: +``` + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" build + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" test + $ make -f tensorflow/lite/micro/tools/make/Makefile TARGET="xcore" < name_of_example i.e. hello_world_test > +``` + + diff --git a/tensorflow/lite/micro/xcore/debug_log.cc b/tensorflow/lite/micro/xcore/debug_log.cc new file mode 100644 index 00000000000..95ef8df6b05 --- /dev/null +++ b/tensorflow/lite/micro/xcore/debug_log.cc @@ -0,0 +1,17 @@ +/* Copyright 2018 The TensorFlow Authors. 
All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/debug_log.h" + + +#include +extern "C" void DebugLog(const char* s) { printf("%s",s); } From 74833a04e032766a27890ff882d669d9e484a497 Mon Sep 17 00:00:00 2001 From: leslie-fang-intel Date: Fri, 8 May 2020 07:34:54 +0800 Subject: [PATCH 013/412] fix dequantize op regression issue --- tensorflow/core/kernels/dequantize_op.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/kernels/dequantize_op.cc b/tensorflow/core/kernels/dequantize_op.cc index 0f5a7019b1f..3b38daf0067 100644 --- a/tensorflow/core/kernels/dequantize_op.cc +++ b/tensorflow/core/kernels/dequantize_op.cc @@ -61,7 +61,9 @@ class DequantizeOp : public OpKernel { " is '" + DataTypeString(ctx->output_type(0)) + "'")); + need_cast_ = true; if (ctx->output_type(0) == DT_FLOAT) { + need_cast_ = false; OP_REQUIRES(ctx, (mode_string == "MIN_COMBINED" || mode_string == "MIN_FIRST" || mode_string == "SCALED"), @@ -98,8 +100,9 @@ class DequantizeOp : public OpKernel { } Tensor* output = nullptr; - Tensor float_output = tensorflow::Tensor(DT_FLOAT, input.shape()); OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input.shape(), &output)); + Tensor float_output = + need_cast_ ? 
tensorflow::Tensor(DT_FLOAT, input.shape()) : *output; if (num_slices == 1) { const float min_range = input_min_tensor.flat()(0); const float max_range = input_max_tensor.flat()(0); @@ -128,10 +131,12 @@ class DequantizeOp : public OpKernel { max_ranges(i), output_tensor.template chip<1>(i)); } } - S* out_ptr = output->flat().data(); - float* in_ptr = float_output.flat().data(); - for (int64 i = 0; i < float_output.NumElements(); ++i) { - out_ptr[i] = static_cast(in_ptr[i]); + if (need_cast_) { + S* out_ptr = output->flat().data(); + float* in_ptr = float_output.flat().data(); + for (int64 i = 0; i < float_output.NumElements(); ++i) { + out_ptr[i] = static_cast(in_ptr[i]); + } } } @@ -219,6 +224,7 @@ class DequantizeOp : public OpKernel { int mode_; int axis_; bool narrow_range_; + bool need_cast_; }; REGISTER_KERNEL_BUILDER(Name("Dequantize") From f8972015fb81588c994fcea65330924f564b9bf4 Mon Sep 17 00:00:00 2001 From: josh meyer Date: Thu, 7 May 2020 17:31:34 -0700 Subject: [PATCH 014/412] better error msg for incorrect WAV format chunk --- tensorflow/core/lib/wav/wav_io.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/lib/wav/wav_io.cc b/tensorflow/core/lib/wav/wav_io.cc index d318059e8f6..fd0f796d93c 100644 --- a/tensorflow/core/lib/wav/wav_io.cc +++ b/tensorflow/core/lib/wav/wav_io.cc @@ -235,7 +235,7 @@ Status DecodeLin16WaveAsFloatVector(const string& wav_string, ReadValue(wav_string, &format_chunk_size, &offset)); if ((format_chunk_size != 16) && (format_chunk_size != 18)) { return errors::InvalidArgument( - "Bad file size for WAV: Expected 16 or 18, but got", format_chunk_size); + "Bad format chunk size for WAV: Expected 16 or 18, but got", format_chunk_size); } uint16 audio_format; TF_RETURN_IF_ERROR(ReadValue(wav_string, &audio_format, &offset)); From dad62f51c9a20d715ce447851c72f9c510959c83 Mon Sep 17 00:00:00 2001 From: Andrew Cavanaugh Date: Fri, 8 May 2020 10:53:06 -0400 Subject: [PATCH 015/412] Update download_and_extract.sh --- tensorflow/lite/micro/tools/make/download_and_extract.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index 5a673985cdd..dca6a309583 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -137,7 +137,7 @@ download_and_extract() { exit 1 fi - # delete anything after the '?' in a url that might confound f + # delete anything after the '?' 
in a url that may mask true file extension url=$(echo "${url}" | sed "s/\?.*//") if [[ "${url}" == *gz ]]; then From cd2f4d16282d36c47547ba72d762ad967ce1d024 Mon Sep 17 00:00:00 2001 From: Bas Aarts Date: Fri, 8 May 2020 07:58:05 -0700 Subject: [PATCH 016/412] address comments on commit fc58d58923534e461d735a9a8b460d2dc8691ae5 --- tensorflow/stream_executor/cuda/cudart_stub.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cudart_stub.cc b/tensorflow/stream_executor/cuda/cudart_stub.cc index 1d7a4e378ba..3b9e0f2937b 100644 --- a/tensorflow/stream_executor/cuda/cudart_stub.cc +++ b/tensorflow/stream_executor/cuda/cudart_stub.cc @@ -132,9 +132,9 @@ extern __host__ __device__ unsigned CUDARTAPI __cudaPushCallConfiguration( } extern char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { - using FuncPtr = cudaError_t(CUDARTAPI *)(void **fatCubinHandle); + using FuncPtr = char(CUDARTAPI *)(void **fatCubinHandle); static auto func_ptr = LoadSymbol("__cudaInitModule"); - if (!func_ptr) return GetSymbolNotFoundError(); + if (!func_ptr) return 0; return func_ptr(fatCubinHandle); } From 497c869b070b33ee082f9b92040f0403dce2ccd7 Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Thu, 7 May 2020 08:31:14 -0700 Subject: [PATCH 017/412] Fix the copy bug. --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 7 +++++-- .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index ec5f10bd2e8..7084736ac3c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2016,7 +2016,9 @@ void IrEmitterUnnested::EmitTile( // True iff all threads always execute all instructions in the tiling // dimension X. 
- bool x_tile_fits = mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0; + bool x_tile_fits = + mapping_scheme.GetDimsInElems()[kDimX] % tile_size_x == 0 && + mapping_scheme.GetRowContiguous(); // The outer loop below is simply doing: // @@ -2731,7 +2733,8 @@ void IrEmitterUnnested::EmitHlo021Tile( /*num_threads_y=*/kNumRows, /*num_threads_x=*/kWarpSize, /*indexing_order=*/kLinearIndexingX, - /*vector_size=*/1); + /*vector_size=*/1, + /*row_contiguous=*/false); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 5e15d0767a1..d9f80172bcb 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -90,13 +90,14 @@ class KernelMappingScheme { KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, int64 num_threads_x, IndexingOrder indexing_order, - int vector_size) + int vector_size, bool row_contiguous = false) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), indexing_order_(indexing_order), - vector_size_(vector_size) { + vector_size_(vector_size), + row_contiguous_(row_contiguous) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); @@ -134,6 +135,7 @@ class KernelMappingScheme { IndexingOrder GetIndexingOrder() const { return indexing_order_; } int GetVectorSize() const { return vector_size_; } + bool GetRowContiguous() const {return row_contiguous_; } private: // The number of elements in each dimension. @@ -159,6 +161,7 @@ class KernelMappingScheme { // to trigger vectorized loads on GPUs while keeping memory // coalescing. const int vector_size_; + const bool row_contiguous_; }; // Information to support the code generation for a tiled reduction kernel. From d0b0e9c2043b22ed855b61095d48dc9c936ca44c Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Fri, 8 May 2020 13:32:39 -0700 Subject: [PATCH 018/412] enabling mkldnn threadpool build options --- .bazelrc | 5 + tensorflow/tensorflow.bzl | 8 +- tensorflow/workspace.bzl | 11 ++ third_party/mkl/build_defs.bzl | 1 + third_party/mkl_dnn/BUILD | 8 ++ third_party/mkl_dnn/build_defs.bzl | 16 +++ third_party/mkl_dnn/mkldnn_threadpool.BUILD | 133 ++++++++++++++++++++ 7 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 third_party/mkl_dnn/mkldnn_threadpool.BUILD diff --git a/.bazelrc b/.bazelrc index cf15d0976b1..2efdbad2e5f 100644 --- a/.bazelrc +++ b/.bazelrc @@ -143,6 +143,11 @@ build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt +build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true +build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 +build:mkl_threadpool --define=build_with_mkldnn_threadpool=true +build:mkl_threadpool -c opt + # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. 
build:using_cuda --define=using_cuda=true diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d9229e00306..ed780092ce1 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -48,6 +48,7 @@ load( "//third_party/mkl_dnn:build_defs.bzl", "if_mkl_open_source_only", "if_mkl_v1_open_source_only", + "if_mkldnn_threadpool", ) load( "//third_party/ngraph:build_defs.bzl", @@ -327,6 +328,9 @@ def tf_copts( if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_mkl_v1_open_source_only(["-DENABLE_MKLDNN_V1"]) + + if_mkldnn_threadpool(["-DENABLE_MKLDNN_THREADPOOL"]) + + if_mkldnn_threadpool(["-DENABLE_MKLDNN_V1"]) + + if_mkldnn_threadpool(["-DINTEL_MKL_DNN_ONLY"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_ngraph(["-DINTEL_NGRAPH=1"]) + if_android_arm(["-mfpu=neon"]) + @@ -348,7 +352,9 @@ def tf_copts( ) def tf_openmp_copts(): - return if_mkl_lnx_x64(["-fopenmp"]) + # TODO(intel-mkl): Remove -fopenmp for threadpool after removing all + # omp pragmas in tensorflow/core. + return if_mkl_lnx_x64(["-fopenmp"]) + if_mkldnn_threadpool(["-fopenmp"]) def tfe_xla_copts(): return select({ diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 56f36a7b004..ab895dd6a99 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -232,6 +232,17 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) + tf_http_archive( + name = "mkl_dnn_tp", + build_file = clean_dep("//third_party/mkl_dnn:mkldnn_threadpool.BUILD"), + sha256 = "54737bcb4dc1961d32ee75da3ecc529fa48198f8b2ca863a079e19a9c4adb70f", + strip_prefix = "oneDNN-1.4", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", + "https://github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", + ], + ) + tf_http_archive( name = "com_google_absl", build_file = clean_dep("//third_party:com_google_absl.BUILD"), diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl index 4b8fb83eb09..f69d27dd094 100644 --- a/third_party/mkl/build_defs.bzl +++ b/third_party/mkl/build_defs.bzl @@ -107,6 +107,7 @@ def mkl_deps(): return select({ "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_only": ["@mkl_dnn"], "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_v1_only": ["@mkl_dnn_v1//:mkl_dnn"], + "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": ["@mkl_dnn_tp//:mkl_dnn"], "@org_tensorflow//third_party/mkl:build_with_mkl_ml_only": ["@org_tensorflow//third_party/mkl:intel_binary_blob"], "@org_tensorflow//third_party/mkl:build_with_mkl": [ "@org_tensorflow//third_party/mkl:intel_binary_blob", diff --git a/third_party/mkl_dnn/BUILD b/third_party/mkl_dnn/BUILD index 774e5b0e2c0..9e617e0055a 100644 --- a/third_party/mkl_dnn/BUILD +++ b/third_party/mkl_dnn/BUILD @@ -27,6 +27,14 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "build_with_mkldnn_threadpool", + define_values = { + "build_with_mkl": "true", + "build_with_mkldnn_threadpool": "true", + }, + visibility = ["//visibility:public"], +) bzl_library( name = "build_defs_bzl", srcs = ["build_defs.bzl"], diff --git a/third_party/mkl_dnn/build_defs.bzl b/third_party/mkl_dnn/build_defs.bzl index af05333c947..5778d136e9b 100644 --- a/third_party/mkl_dnn/build_defs.bzl +++ b/third_party/mkl_dnn/build_defs.bzl @@ -29,3 +29,19 @@ def if_mkl_v1_open_source_only(if_true, if_false = []): "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_v1_only": if_true, 
"//conditions:default": if_false, }) + +def if_mkldnn_threadpool(if_true, if_false = []): + """Returns `if_true` if MKL-DNN v1.x is used. + + Shorthand for select()'ing on whether we're building with + MKL-DNN v1.x open source library only, without depending on MKL binary form. + + Returns a select statement which evaluates to if_true if we're building + with MKL-DNN v1.x open source library only. Otherwise, the + select statement evaluates to if_false. + + """ + return select({ + "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": if_true, + "//conditions:default": if_false, + }) diff --git a/third_party/mkl_dnn/mkldnn_threadpool.BUILD b/third_party/mkl_dnn/mkldnn_threadpool.BUILD new file mode 100644 index 00000000000..35175b7f90f --- /dev/null +++ b/third_party/mkl_dnn/mkldnn_threadpool.BUILD @@ -0,0 +1,133 @@ +exports_files(["LICENSE"]) + +load( + "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", + "if_mkl_open_source_only", + "if_mkldnn_threadpool", +) +load( + "@org_tensorflow//third_party:common.bzl", + "template_rule", +) + +config_setting( + name = "clang_linux_x86_64", + values = { + "cpu": "k8", + "define": "using_clang=true", + }, +) + +template_rule( + name = "dnnl_config_h", + src = "include/dnnl_config.h.in", + out = "include/dnnl_config.h", + substitutions = { + "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", + }, +) +# Create the file mkldnn_version.h with MKL-DNN version numbers. +# Currently, the version numbers are hard coded here. If MKL-DNN is upgraded then +# the version numbers have to be updated manually. The version numbers can be +# obtained from the PROJECT_VERSION settings in CMakeLists.txt. The variable is +# set to "version_major.version_minor.version_patch". The git hash version can +# be set to NA. +# TODO(agramesh1) Automatically get the version numbers from CMakeLists.txt. + +template_rule( + name = "dnnl_version_h", + src = "include/dnnl_version.h.in", + out = "include/dnnl_version.h", + substitutions = { + "@DNNL_VERSION_MAJOR@": "1", + "@DNNL_VERSION_MINOR@": "2", + "@DNNL_VERSION_PATCH@": "0", + "@DNNL_VERSION_HASH@": "N/A", + }, +) + +cc_library( + name = "mkl_dnn", + srcs = glob([ + "src/common/*.cpp", + "src/common/*.hpp", + "src/cpu/*.cpp", + "src/cpu/*.hpp", + "src/cpu/**/*.cpp", + "src/cpu/**/*.hpp", + "src/cpu/xbyak/*.h", + ]) + if_mkldnn_threadpool([ + ":dnnl_config_h", + ]) + [":dnnl_version_h"], + hdrs = glob(["include/*"]), + copts = [ + "-fexceptions", + "-DUSE_MKL", + "-DUSE_CBLAS", + ] + if_mkl_open_source_only([ + "-UUSE_MKL", + "-UUSE_CBLAS", + ]) + if_mkldnn_threadpool([ + "-UUSE_MKL", + "-UUSE_CBLAS", + ]) + select({ + "@org_tensorflow//tensorflow:linux_x86_64": ["-fopenmp-simd"], + # TODO(ibiryukov): enable openmp with clang by including libomp as a + # dependency. 
+ ":clang_linux_x86_64": [], + "//conditions:default": [], + }), + includes = [ + "include", + "src", + "src/common", + "src/cpu", + "src/cpu/gemm", + "src/cpu/xbyak", + ], + visibility = ["//visibility:public"], + deps = select({ + "@org_tensorflow//tensorflow:linux_x86_64": [ + "@mkl_linux//:mkl_headers", + "@mkl_linux//:mkl_libs_linux", + ], + "@org_tensorflow//tensorflow:macos": [ + "@mkl_darwin//:mkl_headers", + "@mkl_darwin//:mkl_libs_darwin", + ], + "@org_tensorflow//tensorflow:windows": [ + "@mkl_windows//:mkl_headers", + "@mkl_windows//:mkl_libs_windows", + ], + "//conditions:default": [], + }), +) + +cc_library( + name = "mkldnn_single_threaded", + srcs = glob([ + "src/common/*.cpp", + "src/common/*.hpp", + "src/cpu/*.cpp", + "src/cpu/*.hpp", + "src/cpu/**/*.cpp", + "src/cpu/**/*.hpp", + "src/cpu/xbyak/*.h", + ]) + [":dnnl_config_h"], + hdrs = glob(["include/*"]), + copts = [ + "-fexceptions", + "-DMKLDNN_THR=MKLDNN_THR_SEQ", # Disables threading. + ], + includes = [ + "include", + "src", + "src/common", + "src/cpu", + "src/cpu/gemm", + "src/cpu/xbyak", + ], + visibility = ["//visibility:public"], +) From f1ddb29ce910e714f2211ca9b14e47f0cf31f05e Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 8 May 2020 14:03:59 -0700 Subject: [PATCH 019/412] Add a test --- .../compiler/xla/service/gpu/tests/BUILD | 14 +++++ .../service/gpu/tests/gpu_copy_alone_test.cc | 61 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index e04dba418d9..dae63be683f 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -235,6 +235,20 @@ tf_cc_test( ], ) +tf_cc_test( + name = "gpu_copy_alone_test", + srcs = [ + "gpu_copy_alone_test.cc", + ], + tags = tf_cuda_tests_tags() + ["no_rocm"], + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + tf_cc_test( name = "gpu_ftz_test", srcs = ["gpu_ftz_test.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc new file mode 100644 index 00000000000..ac3111aaae1 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_alone_test.cc @@ -0,0 +1,61 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +namespace xla { +namespace gpu { + +namespace { + +// WARNING: This tests must be alone in its file! Otherwise, the +// error isn't caught. 
We expect and CUDA_ERROR_ILLEGAL_ADDRESS to be +// thrown with the old buggy code. +class CopyAloneNoOptTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + // The test MultiOutputStore contain a MOF fusion and XLA optimizer pass doesn't like this. + debug_options.set_xla_disable_all_hlo_passes(true); + return debug_options; + } +}; + +TEST_F(CopyAloneNoOptTest, CopyTranspose) { + const char* hlo_text = R"( +HloModule mod +ENTRY main { + %param = f32[8,32,32,32,16]{4,3,2,1,0} parameter(0) + ROOT %copy = f32[8,32,32,32,16]{3,2,1,4,0} copy(f32[8,32,32,32,16]{4,3,2,1,0} %param) +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr optimized_module, + ParseAndReturnVerifiedModule(hlo_text)); + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); + + CompileAndOptionallyVerifyPtx(std::move(optimized_module), + R"( +CHECK-NOT: ld.global.nc.v2 +)"); + +} + +} // namespace +} // namespace gpu +} // namespace xla From 3d42daf076d7caa8ad07182aa10dab4f90a1e45d Mon Sep 17 00:00:00 2001 From: Frederic Bastien Date: Fri, 8 May 2020 14:28:44 -0700 Subject: [PATCH 020/412] NFC: rename a variable --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 2 +- .../compiler/xla/service/gpu/kernel_mapping_scheme.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 7084736ac3c..a78ffc8dd1a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2734,7 +2734,7 @@ void IrEmitterUnnested::EmitHlo021Tile( /*num_threads_x=*/kWarpSize, /*indexing_order=*/kLinearIndexingX, /*vector_size=*/1, - /*row_contiguous=*/false); + /*is_row_contiguous=*/false); LaunchDimensions launch_dimensions(mapping_scheme.GetNumberOfBlocks(), mapping_scheme.GetThreadsPerBlock()); llvm::Type* index_type = diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index d9f80172bcb..99fa60a24c2 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -90,14 +90,14 @@ class KernelMappingScheme { KernelMappingScheme(absl::Span dims_in_elems, absl::Span tile_sizes, int64 num_threads_y, int64 num_threads_x, IndexingOrder indexing_order, - int vector_size, bool row_contiguous = false) + int vector_size, bool is_row_contiguous = false) : dims_in_elems_{dims_in_elems[0], dims_in_elems[1], dims_in_elems[2]}, tile_sizes_{tile_sizes[0], tile_sizes[1], tile_sizes[2]}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), indexing_order_(indexing_order), vector_size_(vector_size), - row_contiguous_(row_contiguous) { + is_row_contiguous_(is_row_contiguous) { CHECK_EQ(tile_sizes[1] % num_threads_y_, 0); CHECK_EQ(tile_sizes[2] % num_threads_x_, 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); @@ -135,7 +135,7 @@ class KernelMappingScheme { IndexingOrder GetIndexingOrder() const { return indexing_order_; } int GetVectorSize() const { return vector_size_; } - bool GetRowContiguous() const {return row_contiguous_; } + bool GetRowContiguous() const {return is_row_contiguous_; } private: // The number of elements in each dimension. 
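
Editor's note: the copy-bug fix earlier in this series gates the fast path in `EmitTile` on this new flag: even when the X dimension divides evenly into tiles, per-element bounds checks may only be skipped if rows are actually contiguous, and the 021-transpose tiling constructs its mapping scheme with the flag set to false. A toy sketch of that decision follows; it is not XLA's real `KernelMappingScheme` and assumes `dims_in_elems` is ordered `[Z, Y, X]` with X last.

```
# Toy model of the x_tile_fits decision; not XLA's actual API.
class MappingScheme:
    def __init__(self, dims_in_elems, tile_sizes, is_row_contiguous=False):
        self.dims_in_elems = dims_in_elems    # assumed order [Z, Y, X]
        self.tile_sizes = tile_sizes          # assumed order [tz, ty, tx]
        self.is_row_contiguous = is_row_contiguous

    def x_tile_fits(self):
        # Skipping bounds checks in dimension X is only safe when X divides
        # evenly into tiles AND the rows are contiguous in memory.
        return (self.dims_in_elems[2] % self.tile_sizes[2] == 0
                and self.is_row_contiguous)

print(MappingScheme([8, 32, 32], [1, 4, 32]).x_tile_fits())                          # False: flag defaults off
print(MappingScheme([8, 32, 32], [1, 4, 32], is_row_contiguous=True).x_tile_fits())  # True
```
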
@@ -161,7 +161,7 @@ class KernelMappingScheme { // to trigger vectorized loads on GPUs while keeping memory // coalescing. const int vector_size_; - const bool row_contiguous_; + const bool is_row_contiguous_; }; // Information to support the code generation for a tiled reduction kernel. From 68952b2608d0b5bd614adb33b753288f99be66f9 Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Sat, 9 May 2020 05:16:03 +0530 Subject: [PATCH 021/412] Added POC for tf_program Signed-off-by: Shraiysh Vaishay --- tensorflow/python/tf_program/BUILD | 22 + tensorflow/python/tf_program/mlir_gen.py | 452 ++++++++++++++++++ .../python/tf_program/mlir_wrapper/BUILD | 36 ++ .../python/tf_program/mlir_wrapper/attrs.cc | 25 + .../tf_program/mlir_wrapper/basic_classes.cc | 49 ++ .../tf_program/mlir_wrapper/builders.cc | 51 ++ .../tf_program/mlir_wrapper/mlir_util.h | 25 + .../tf_program/mlir_wrapper/mlir_wrapper.cc | 42 ++ .../python/tf_program/mlir_wrapper/ops.cc | 194 ++++++++ .../python/tf_program/mlir_wrapper/types.cc | 48 ++ tensorflow/python/tf_program/pywrap_tfd.py | 149 ++++++ tensorflow/python/tf_program/tests/BUILD | 31 ++ .../tf_program/tests/filecheck_wrapper.cc | 36 ++ .../python/tf_program/tests/mlir_gen_test.py | 228 +++++++++ 14 files changed, 1388 insertions(+) create mode 100644 tensorflow/python/tf_program/BUILD create mode 100644 tensorflow/python/tf_program/mlir_gen.py create mode 100644 tensorflow/python/tf_program/mlir_wrapper/BUILD create mode 100644 tensorflow/python/tf_program/mlir_wrapper/attrs.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/builders.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/mlir_util.h create mode 100644 tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/ops.cc create mode 100644 tensorflow/python/tf_program/mlir_wrapper/types.cc create mode 100644 tensorflow/python/tf_program/pywrap_tfd.py create mode 100644 tensorflow/python/tf_program/tests/BUILD create mode 100644 tensorflow/python/tf_program/tests/filecheck_wrapper.cc create mode 100644 tensorflow/python/tf_program/tests/mlir_gen_test.py diff --git a/tensorflow/python/tf_program/BUILD b/tensorflow/python/tf_program/BUILD new file mode 100644 index 00000000000..69eb9e7c031 --- /dev/null +++ b/tensorflow/python/tf_program/BUILD @@ -0,0 +1,22 @@ +package(licenses = ["notice"]) + +py_library( + name = "pywrap_tfd", + srcs = ["pywrap_tfd.py"], + deps = [ + "//tensorflow/python/tf_program/mlir_wrapper", + ], +) + +py_library( + name = "mlir_gen", + srcs = ["mlir_gen.py"], + visibility = ["//visibility:public"], + deps = [ + ":pywrap_tfd", + "//tensorflow/python/autograph/pyct", + "//tensorflow/python/autograph/pyct/static_analysis", + "//tensorflow/python/types", + "@gast_archive//:gast", + ], +) diff --git a/tensorflow/python/tf_program/mlir_gen.py b/tensorflow/python/tf_program/mlir_gen.py new file mode 100644 index 00000000000..74622d44424 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_gen.py @@ -0,0 +1,452 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +'''mlir_gen: Generate mlir code from python code''' + +# pylint: disable=invalid-name +# pylint: disable=missing-function-docstring + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast as ast +import tensorflow.python.tf_program.pywrap_tfd as tfp +from tensorflow.python.autograph.pyct import anno +from tensorflow.python.autograph.pyct import cfg +from tensorflow.python.autograph.pyct import inspect_utils +from tensorflow.python.autograph.pyct import naming +from tensorflow.python.autograph.pyct import parser +from tensorflow.python.autograph.pyct import qual_names +from tensorflow.python.autograph.pyct import transformer +from tensorflow.python.autograph.pyct.static_analysis import activity +from tensorflow.python.autograph.pyct.static_analysis import annos +from tensorflow.python.autograph.pyct.static_analysis import liveness +from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions +from tensorflow.python.autograph.pyct.static_analysis import reaching_fndefs +from tensorflow.python.types import core + +class SymbolTable: + ''' + Symbol Table for python code + ''' + def __init__(self): + self.symbols = [] + self.enter_scope() + + def enter_scope(self): + ''' + Enter a new scope - at function level + ''' + self.symbols.append({'types': {}, 'symbols': {}}) + self.curr_table = self.symbols[len(self.symbols)-1] + + def insert_symbol(self, name, value): + self.curr_table['symbols'][name] = value + self.curr_table['types'][name] = value.getType() + return value + + def insert_type(self, name, type_): + self.curr_table['types'][name] = type_ + + def exit_scope(self): + self.symbols.pop() + self.curr_table = self.symbols[len(self.symbols)-1] + + def lookup(self, name): + curr_idx = len(self.symbols)-1 + while curr_idx >= 0 and (not name in self.symbols[curr_idx]['symbols']): + curr_idx -= 1 + if curr_idx < 0: + return None + return self.symbols[curr_idx]['symbols'][name] + + def lookup_type(self, name): + curr_idx = len(self.symbols)-1 + while curr_idx >= 0 and (not name in self.symbols[curr_idx]['types']): + curr_idx -= 1 + if curr_idx < 0: + return None + return self.symbols[curr_idx]['types'][name] + + def __repr__(self): + s = '\n'.join(' ' * idx * 2 + str(table) + for idx, table in enumerate(self.symbols)) + return s + +class ProcessType(ast.NodeVisitor): + ''' + Visit a node and return processed type + Currently only visits annotations and gives their type + ''' + def __init__(self, prog, ctx): + self.prog = prog + self.ctx = ctx + + def visit_Attribute(self, node): + # Supported: core.Tensor + value = self.visit(node.value) + if value is None or not hasattr(value, node.attr): + raise AttributeError(str(type(value)) + ' has no attribute ' + node.attr) + attr = getattr(value, node.attr) + + if attr == core.Tensor: + return tfp.UnrankedTensorType.get(tfp.IntegerType.get(32, self.prog.ctx)) + return attr + + def visit_Name(self, node): + if node.id == 'int': + return tfp.IntegerType.get(32, 
self.prog.ctx) + if node.id == 'bool': + return tfp.IntegerType.get(1, self.prog.ctx) + if node.id in self.ctx.info.namespace: + return self.ctx.info.namespace[node.id] + +class MLIRGen(ast.NodeVisitor): + ''' + Visit the AST and generate MLIR code + Requires liveness, reading_definitions + ''' + def __init__(self, ctx): + self.ctx = ctx + self.symbol_table = SymbolTable() + self.prog = tfp.TFProgram() + self.opbuilder = None + + def visit_block(self, block): + return [self.visit(item) for item in block] + + def process_type(self, node): + return ProcessType(self.prog, self.ctx).visit(node) + + def visit_Assign(self, node): + value = self.visit(node.value) + if isinstance(value, tuple): + # If it is a tuple of values, assign one to each in targets + # TODO: This currently is assuming that all elts in targets[0] are Name + # objects. This might not be always True. + for key, val in zip(node.targets[0].elts, value): + self.symbol_table.insert_symbol(key.id, val) + else: + self.symbol_table.insert_symbol(node.targets[0].id, value) + + def visit_BinOp(self, node): + left = self.visit(node.left) + right = self.visit(node.right) + if isinstance(node.op, ast.Sub): + return tfp.Tf_SubOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), left, + right).getResult(0) + if isinstance(node.op, ast.Add): + return tfp.Tf_AddV2Op.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), left, + right).getResult(0) + + def visit_BoolOp(self, node): + values = [self.visit(value) for value in node.values] + if isinstance(node.op, ast.Or): + return tfp.OrOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), values).getResult(0) + if isinstance(node.op, ast.And): + return tfp.AndOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), values).getResult(0) + + def visit_Call(self, node): + func = self.visit(node.func) + args = [self.visit(arg) for arg in node.args] + callop = tfp.Tf_LegacyCallOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), + func.getType().getResults(), args, func.getName()) + if callop.getNumResults() == 1: + return callop[0] + return tuple(callop.getResult(idx) for idx in range(callop.getNumResults())) + + def visit_Compare(self, node): + left = self.visit(node.left) + opb = self.opbuilder + for op, right in zip(node.ops, node.comparators): + if isinstance(op, ast.Eq): + left = tfp.Tf_EqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.Lt): + left = tfp.Tf_LessOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.LtE): + left = tfp.Tf_LessEqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.Gt): + left = tfp.Tf_GreaterOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.GtE): + left = tfp.Tf_GreaterEqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + elif isinstance(op, ast.NotEq): + left = tfp.Tf_NotEqualOp.create(opb, opb.getUnknownLoc(), left, + self.visit(right)).getResult(0) + else: + print(op) + raise NotImplementedError("CompareOp operator not recognized") + return left + + def visit_Constant(self, node): + opb = self.opbuilder + value = None + if isinstance(node.value, int): + value = tfp.Tf_ConstOp.create( + opb, opb.getUnknownLoc(), tfp.IntegerAttr.get(tfp.IntegerType.get( + 32, self.prog.ctx), node.value)).getResult(0) + return value + + def visit_FunctionDef(self, node): + # Cache the current builder + cache_builder = 
self.opbuilder + inputs, outputs = [], [] + + for arg in node.args.args: + inputs.append(self.process_type(arg.annotation)) + + if node.returns: + outputs = [self.process_type(node.returns)] + + currfunc = self.prog.add_function( + self.ctx.namer.new_symbol(node.name, []), + self.prog.get_function_type(inputs, outputs)) + + # Add the function to symbol table and enter new scope + self.symbol_table.insert_symbol(node.name, currfunc) + self.symbol_table.enter_scope() + + # Add arguments to symbol table + for arg, value in zip(node.args.args, currfunc.getArguments()): + self.symbol_table.insert_symbol(arg.id, value) + self.opbuilder = tfp.OpBuilder(currfunc.getBody()) + + self.visit_block(node.body) + self.symbol_table.exit_scope() + self.opbuilder = cache_builder + + def visit_If(self, node): + cond = self.visit(node.test) + + # Create ifop + body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) + orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE) + modified_in_cond = list(body_scope.modified | orelse_scope.modified) + outputs = [self.symbol_table.lookup_type(str(var)) + for var in modified_in_cond] + ifop = tfp.IfOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), cond, outputs) + + # Cache the builder + cache_builder = self.opbuilder + + # Visit body + self.opbuilder = tfp.OpBuilder(ifop.getRegion(0)) + # Enter scope to avoid values generated inside the region to come in symbol table + self.symbol_table.enter_scope() + for stmt in node.body: + self.visit(stmt) + retvals = [self.symbol_table.lookup(str(varname)) + for varname in modified_in_cond] + tfp.ReturnOp.create(self.opbuilder, self.opbuilder.getUnknownLoc(), retvals) + self.symbol_table.exit_scope() + + # Visit orelse + self.opbuilder = tfp.OpBuilder(ifop.getRegion(1)) + self.symbol_table.enter_scope() + for stmt in node.orelse: + self.visit(stmt) + retvals = [self.symbol_table.lookup(str(varname)) + for varname in modified_in_cond] + tfp.ReturnOp.create(self.opbuilder, self.opbuilder.getUnknownLoc(), retvals) + self.symbol_table.exit_scope() + + # Reset builder and enter return values in symbol table + self.opbuilder = cache_builder + for idx, var in enumerate(modified_in_cond): + self.symbol_table.insert_symbol(str(var), ifop.getResult(idx)) + + if ifop.getNumResults() == 1: + return ifop.getResult(0) + + return tuple(ifop.getResult(i) for i in range(ifop.getNumResults())) + + def visit_Name(self, node): + if self.symbol_table.lookup(node.id): + return self.symbol_table.lookup(node.id) + raise NotImplementedError('Symbol not found' + node.id) + + def visit_Return(self, node): + opb = self.opbuilder + value = self.visit(node.value) + if isinstance(value, tuple): + # For more than one return values + return tfp.ReturnOp.create(opb, opb.getUnknownLoc(), list(value)) + return tfp.ReturnOp.create(opb, opb.getUnknownLoc(), [value]) + + def visit_Tuple(self, node): + return tuple(self.visit(elt) for elt in node.elts) + + def visit_UnaryOp(self, node): + operand = self.visit(node.operand) + if isinstance(node.op, ast.USub): + return tfp.Tf_NegOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), operand).getResult(0) + + def _get_basic_loop_vars(self, modified, live_in, live_out): + # [This is directly from + # tensorflow/python/autograph/converters/control_flow.py] + # The loop variables corresponding to simple symbols (e.g. `x`). + basic_loop_vars = [] + for s in modified: + if s.is_composite(): + # TODO: Raise an error when this happens for a TF loop. 
+ continue + # Variables not live into or out of the loop are considered local to the + # loop. + if s not in live_in and s not in live_out: + continue + basic_loop_vars.append(s) + return frozenset(basic_loop_vars) + + def _get_composite_loop_vars(self, modified, live_in): + # [This is directly from + # tensorflow/python/autograph/converters/control_flow.py] + # The loop variables corresponding to composite symbols (e.g. `self.x`). + composite_loop_vars = [] + for s in modified: + if not s.is_composite(): + continue + # Mutations made to objects created inside the loop will appear as writes + # to composite symbols. Because these mutations appear as modifications + # made to composite symbols, we check whether the composite's parent is + # actually live into the loop. + # Example: + # while cond: + # x = Foo() + # x.foo = 2 * x.foo # x.foo is live into the loop, but x is not. + # + # Note that some parents might not be symbols - for example, in x['foo'], + # 'foo' is a parent, but it's a literal, not a symbol. We don't check the + # liveness of literals. + support_set_symbols = tuple( + sss for sss in s.support_set if sss.is_symbol()) + if not all(sss in live_in for sss in support_set_symbols): + continue + composite_loop_vars.append(s) + return frozenset(composite_loop_vars) + + def _get_loop_vars(self, node, modified): + # [This is directly from python/autograph/converters/control_flow.py] + body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) + defined_in = anno.getanno(node, anno.Static.DEFINED_VARS_IN) + live_in = anno.getanno(node, anno.Static.LIVE_VARS_IN) + live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT) + reserved_symbols = body_scope.referenced + + basic_loop_vars = self._get_basic_loop_vars(modified, live_in, live_out) + composite_loop_vars = self._get_composite_loop_vars(modified, live_in) + loop_vars = tuple(basic_loop_vars | composite_loop_vars) + + # Variable that are used or defined inside the loop, but not defined + # before entering the loop. Only simple variables must be defined. The + # composite ones will be implicitly checked at runtime. 
+ undefined_lives = basic_loop_vars - defined_in + + return loop_vars, reserved_symbols, undefined_lives + + def visit_While(self, node): + + # Create a new WhileOp + # `inputs` are initial values for loop variables + body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE) + loop_vars, _, _ = self._get_loop_vars(node, body_scope.modified) + inputs = [self.symbol_table.lookup(str(name)) + for name in loop_vars] + types = [input_.getType() for input_ in inputs] + while_op = tfp.WhileOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), inputs, types) + + # cache the current builder + cache_builder = self.opbuilder + + # Process cond + self.symbol_table.enter_scope() + for input_, type_ in zip(loop_vars, types): + self.symbol_table.insert_symbol( + str(input_), while_op.getRegion(0).front().addArgument(type_)) + self.opbuilder = tfp.OpBuilder(while_op.getRegion(0)) + tfp.ReturnOp.create( + self.opbuilder, self.opbuilder.getUnknownLoc(), [self.visit(node.test)]) + self.symbol_table.exit_scope() + + # Process body + self.symbol_table.enter_scope() + for input_, type_ in zip(loop_vars, types): + self.symbol_table.insert_symbol( + str(input_), while_op.getRegion(1).front().addArgument(type_)) + self.opbuilder = tfp.OpBuilder(while_op.getRegion(1)) + self.visit_block(node.body) + tfp.ReturnOp.create(self.opbuilder, self.opbuilder.getUnknownLoc(), [ + self.symbol_table.lookup(str(name)) for name in loop_vars]) + self.symbol_table.exit_scope() + + # Enter new values as symbols + for idx, var in enumerate(loop_vars): + self.symbol_table.insert_symbol(str(var), while_op.getResult(idx)) + + # Restore builder + self.opbuilder = cache_builder + +def mlir_gen_internal(node, entity_info): + ''' + Returns mlir module for unprocessed node `node` + ''' + namer = naming.Namer({}) + graphs = cfg.build(node) + ctx = transformer.Context(entity_info, namer, None) + node = qual_names.resolve(node) + node = activity.resolve(node, ctx) + node = reaching_definitions.resolve(node, ctx, graphs) + node = reaching_fndefs.resolve(node, ctx, graphs) + node = liveness.resolve(node, ctx, graphs) + mlir_generator = MLIRGen(ctx) + mlir_generator.visit(node) + return mlir_generator.prog + +def mlir_gen(func): + ''' + Parse a function and return TFProgram + ''' + node, source = parser.parse_entity(func, future_features=()) + entity_info = transformer.EntityInfo( + name=func.__name__, + source_code=source, + source_file=None, + future_features=(), + namespace=inspect_utils.getnamespace(func)) + return mlir_gen_internal(node, entity_info) + +def mlir_gen_from_source(source=None, src_file=None): + if source is None: + source = open(src_file).read() + node = ast.parse(source) + entity_info = transformer.EntityInfo( + name="mlir_module", + source_code=source, + source_file=None, + future_features=(), + namespace={}) + return mlir_gen_internal(node, entity_info) diff --git a/tensorflow/python/tf_program/mlir_wrapper/BUILD b/tensorflow/python/tf_program/mlir_wrapper/BUILD new file mode 100644 index 00000000000..ec600dcf67e --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/BUILD @@ -0,0 +1,36 @@ +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +package(licenses = ["notice"]) + +tf_python_pybind_extension( + name = "mlir_wrapper", + srcs = ["mlir_wrapper.cc"], + module_name = "mlir_wrapper", + visibility = ["//visibility:public"], + deps = [ + ":mlir_util", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + 
"@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) + +cc_library( + name = "mlir_util", + srcs = [ + "attrs.cc", + "basic_classes.cc", + "builders.cc", + "ops.cc", + "types.cc", + ], + hdrs = ["mlir_util.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) diff --git a/tensorflow/python/tf_program/mlir_wrapper/attrs.cc b/tensorflow/python/tf_program/mlir_wrapper/attrs.cc new file mode 100644 index 00000000000..16ccc27ef2b --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/attrs.cc @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/Attributes.h" +#include "mlir/IR/Types.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_attrs(py::module& m) { + py::class_(m, "Attribute"); + py::class_(m, "IntegerAttr") + .def("get", + py::overload_cast(&mlir::IntegerAttr::get)); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc b/tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc new file mode 100644 index 00000000000..dabd012c0e7 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/basic_classes.cc @@ -0,0 +1,49 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "llvm/Support/FileCheck.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Region.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_basic_classes(py::module& m) { + py::class_(m, "MLIRContext").def(py::init<>()); + + py::class_(m, "Location"); + + py::class_(m, "UnknownLoc") + .def("get", &mlir::UnknownLoc::get); + + py::class_(m, "Region") + .def("back", &mlir::Region::back, py::return_value_policy::reference) + .def("front", &mlir::Region::front, py::return_value_policy::reference) + .def("add_block", [](mlir::Region& r) { r.push_back(new mlir::Block); }) + .def("push_back", &mlir::Region::push_back) + .def("size", [](mlir::Region& r) { return r.getBlocks().size(); }) + .def("front", &mlir::Region::front, py::return_value_policy::reference); + py::class_(m, "Block_Iterator"); + py::class_(m, "Block") + .def("new", ([]() { return new mlir::Block; }), + py::return_value_policy::reference) + .def("end", &mlir::Block::end) + .def("addArgument", &mlir::Block::addArgument); + + py::class_(m, "Value").def("getType", &mlir::Value::getType); + py::class_(m, "OpResult"); + py::class_(m, "BlockArgument"); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/builders.cc b/tensorflow/python/tf_program/mlir_wrapper/builders.cc new file mode 100644 index 00000000000..0e83c4d7e5e --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/builders.cc @@ -0,0 +1,51 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "mlir/IR/Builders.h" + +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_builders(py::module& m) { + py::class_(m, "Builder") + .def(py::init()) + .def("getFunctionType", + [](mlir::Builder& b, std::vector inputs, + std::vector outputs) { + return b.getFunctionType(llvm::ArrayRef(inputs), + llvm::ArrayRef(outputs)); + }); + py::class_(m, "OpBuilder") + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def(py::init()) + .def("getUnknownLoc", &mlir::OpBuilder::getUnknownLoc) + .def("setInsertionPoint", + py::overload_cast( + &mlir::OpBuilder::setInsertionPoint)) + .def("saveInsertionPoint", &mlir::OpBuilder::saveInsertionPoint) + .def("restoreInsertionPoint", &mlir::OpBuilder::restoreInsertionPoint) + .def( + "createOperation", + [](mlir::OpBuilder& opb, mlir::OperationState& state) { + return opb.createOperation(state); + }, + py::return_value_policy::reference) + .def("getContext", &mlir::OpBuilder::getContext, + py::return_value_policy::reference); + + py::class_(m, "OpBuilder_InsertionPoint") + .def("getBlock", &mlir::OpBuilder::InsertPoint::getBlock); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/mlir_util.h b/tensorflow/python/tf_program/mlir_wrapper/mlir_util.h new file mode 100644 index 00000000000..66350ad978a --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/mlir_util.h @@ -0,0 +1,25 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +void init_basic_classes(py::module& m); +void init_types(py::module& m); +void init_builders(py::module& m); +void init_ops(py::module& m); +void init_attrs(py::module& m); diff --git a/tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc b/tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc new file mode 100644 index 00000000000..55b0425c445 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/mlir_wrapper.cc @@ -0,0 +1,42 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +void init_basic_classes(py::module& m); +void init_types(py::module& m); +void init_builders(py::module& m); +void init_ops(py::module& m); +void init_attrs(py::module& m); + +PYBIND11_MODULE(mlir_wrapper, m) { + m.def("registerDialects", []() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + }); + + init_basic_classes(m); + init_types(m); + init_builders(m); + init_ops(m); + init_attrs(m); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/ops.cc b/tensorflow/python/tf_program/mlir_wrapper/ops.cc new file mode 100644 index 00000000000..0391e31b9c2 --- /dev/null +++ b/tensorflow/python/tf_program/mlir_wrapper/ops.cc @@ -0,0 +1,194 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/Dialect/StandardOps/IR/Ops.h" + +#include "mlir/IR/Function.h" +#include "mlir/IR/Operation.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_ops(py::module& m) { + py::class_>( + m, "Operation") + .def("getRegion", &mlir::Operation::getRegion, + py::return_value_policy::reference) + .def("getResult", &mlir::Operation::getResult) + .def("dump", &mlir::Operation::dump) + .def("getNumResults", &mlir::Operation::getNumResults); + + py::class_(m, "OperationState") + .def(py::init([](mlir::Location loc, std::string name) { + return mlir::OperationState(loc, llvm::StringRef(name)); + })) + .def("addTypes", + [](mlir::OperationState& state, std::vector tys) { + state.addTypes(mlir::ArrayRef(tys)); + }) + .def("addOperands", + [](mlir::OperationState& os, std::vector ops) { + os.addOperands(mlir::ArrayRef(ops)); + }) + .def("addRegion", py::overload_cast<>(&mlir::OperationState::addRegion), + py::return_value_policy::reference); + + py::class_(m, "ModuleOp") + .def("create", + [](mlir::Location loc) { return mlir::ModuleOp::create(loc); }) + .def("push_back", + [](mlir::ModuleOp& m, mlir::FuncOp f) { m.push_back(f); }) + .def("dump", &mlir::ModuleOp::dump) + .def("getAsStr", [](mlir::ModuleOp& m) { + std::string str; + llvm::raw_string_ostream os(str); + m.print(os); + return os.str(); + }); + + py::class_(m, "FuncOp") + .def("create", + [](mlir::Location location, std::string name, + mlir::FunctionType type) { + auto func = mlir::FuncOp::create(location, name, type); + func.addEntryBlock(); + return func; + }) + .def( + "getBody", + [](mlir::FuncOp& f) -> mlir::Region& { return f.getBody(); }, + py::return_value_policy::reference) 
+ .def("getArguments", + [](mlir::FuncOp& f) { return f.getArguments().vec(); }) + .def("getName", [](mlir::FuncOp& f) { return f.getName().str(); }) + .def("getType", &mlir::FuncOp::getType); + + py::class_(m, "ReturnOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector values) -> mlir::Operation* { + return opb + .create(loc, + mlir::ArrayRef(values)) + .getOperation(); + }); + + // mlir::TF::AddOp + py::class_(m, "Tf_AddV2Op") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + py::class_(m, "Tf_AnyOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value input, + mlir::Value reduction_indices, + bool keep_dims = false) -> mlir::Operation* { + return opb + .create(loc, opb.getI1Type(), input, + reduction_indices, keep_dims) + .getOperation(); + }); + + // mlir::TF::ConstOp + py::class_(m, "Tf_ConstOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Attribute value) -> mlir::Operation* { + return opb.create(loc, value).getOperation(); + }); + + // mlir::TF::EqualOp + py::class_(m, "Tf_EqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb + .create(loc, x, y, opb.getBoolAttr(true)) + .getOperation(); + }); + + // mlir::TF::GreaterEqualOp + py::class_(m, "Tf_GreaterEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y) + .getOperation(); + }); + + // mlir::TF::GreaterOp + py::class_(m, "Tf_GreaterOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LegacyCallOp + py::class_(m, "Tf_LegacyCallOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + std::vector output, std::vector args, + std::string f) -> mlir::Operation* { + return opb + .create( + loc, mlir::ArrayRef(output), + mlir::ArrayRef(args), mlir::StringRef(f)) + .getOperation(); + }); + + // mlir::TF::LessEqualOp + py::class_(m, "Tf_LessEqualOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::LessOp + py::class_(m, "Tf_LessOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); + + // mlir::TF::NegOp + py::class_(m, "Tf_NegOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, + mlir::Value x) -> mlir::Operation* { + return opb.create(loc, x).getOperation(); + }); + + py::class_(m, "Tf_NotEqualOp") + .def("create", [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) { + return opb + .create( + loc, x, y, mlir::BoolAttr::get(true, opb.getContext())) + .getOperation(); + }); + + // mlir::TF::SubOp + py::class_(m, "Tf_SubOp") + .def("create", + [](mlir::OpBuilder& opb, mlir::Location loc, mlir::Value x, + mlir::Value y) -> mlir::Operation* { + return opb.create(loc, x, y).getOperation(); + }); +} diff --git a/tensorflow/python/tf_program/mlir_wrapper/types.cc b/tensorflow/python/tf_program/mlir_wrapper/types.cc new file mode 100644 index 00000000000..461d10bd160 --- /dev/null +++ 
b/tensorflow/python/tf_program/mlir_wrapper/types.cc @@ -0,0 +1,48 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "mlir/IR/StandardTypes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/python/tf_program/mlir_wrapper/mlir_util.h" + +void init_types(py::module& m) { + // Type + py::class_ Type(m, "Type"); + Type.def("getKind", &mlir::Type::getKind); + + // Type Enums + py::enum_(Type, "StandardTypes_Kind") + .value("BF16", mlir::StandardTypes::BF16); + + // Type Sub-classes + py::class_(m, "FunctionType") + .def("getResults", + [](mlir::FunctionType& ft) { return ft.getResults().vec(); }); + + py::class_(m, "FloatType") + .def("get", &mlir::FloatType::get); + + py::class_(m, "IntegerType") + .def("get", py::overload_cast( + &mlir::IntegerType::get)); + + py::class_(m, "UnrankedTensorType") + .def("get", &mlir::UnrankedTensorType::get); + + py::class_(m, "RankedTensorType") + .def("get", [](std::vector shape, mlir::Type ty) { + return mlir::RankedTensorType::get(mlir::ArrayRef(shape), ty); + }); +} diff --git a/tensorflow/python/tf_program/pywrap_tfd.py b/tensorflow/python/tf_program/pywrap_tfd.py new file mode 100644 index 00000000000..fa9415206cf --- /dev/null +++ b/tensorflow/python/tf_program/pywrap_tfd.py @@ -0,0 +1,149 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +''' +Intermediate between python bindings for MLIR and mlir generation for tensorflow +program. 
This passes most of the mlir classes as is, but adds a few new +operations and the basic structure for a tensorflow program +''' + +# pylint: disable=invalid-name + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.tf_program.mlir_wrapper import mlir_wrapper as mlir + +# Class Definitions +OpBuilder = mlir.OpBuilder +Block = mlir.Block + +# Types +Type = mlir.Type +IntegerType = mlir.IntegerType +FloatType = mlir.FloatType +RankedTensorType = mlir.RankedTensorType +UnrankedTensorType = mlir.UnrankedTensorType +IntegerAttr = mlir.IntegerAttr + +# Standard Ops +ReturnOp = mlir.ReturnOp + +# TF Dialect Ops +Tf_AnyOp = mlir.Tf_AnyOp +Tf_AddV2Op = mlir.Tf_AddV2Op +Tf_ConstOp = mlir.Tf_ConstOp +Tf_EqualOp = mlir.Tf_EqualOp +Tf_GreaterEqualOp = mlir.Tf_GreaterEqualOp +Tf_GreaterOp = mlir.Tf_GreaterOp +Tf_LegacyCallOp = mlir.Tf_LegacyCallOp +Tf_LessEqualOp = mlir.Tf_LessEqualOp +Tf_LessOp = mlir.Tf_LessOp +Tf_NegOp = mlir.Tf_NegOp +Tf_NotEqualOp = mlir.Tf_NotEqualOp +Tf_SubOp = mlir.Tf_SubOp + +class IfOp: + ''' + tfp.if(cond) ({body}, {orelse}) : type + If `cond` is true, `body` is executed, otherwise `orelse` is executed + ''' + @classmethod + def create(cls, opb, loc, cond, outputs): + state = mlir.OperationState(loc, "tfp.If") + state.addOperands([cond]) + state.addTypes(outputs) + state.addRegion().push_back(Block.new()) # body region + state.addRegion().push_back(Block.new()) # orelse region + return opb.createOperation(state) + +class OrOp: + ''' + tfp.Or(ops...) + This is like tf.Any, except that the first dimension is opened into `ops`. + Returns a tensor of 1-bit integers which is "Logical OR" of the coressponding + elements in ops... + ''' + @classmethod + def create(cls, opb, loc, values): + state = mlir.OperationState(loc, "tfp.Or") + state.addTypes([ + UnrankedTensorType.get(IntegerType.get(1, opb.getContext()))]) + state.addOperands(values) + return opb.createOperation(state) + +class AndOp: + ''' + tfp.And(ops...) + This is like tf.All, except that the first dimension is opened to `ops`. + Returns a tensor of 1-bit integers which is "Logical AND" of the coressponding + elements in ops... 
+ ''' + @classmethod + def create(cls, opb, loc, values): + state = mlir.OperationState(loc, "tfp.And") + state.addTypes([ + UnrankedTensorType.get(IntegerType.get(1, opb.getContext()))]) + state.addOperands(values) + return opb.createOperation(state) + +class WhileOp: + ''' + tfp.While(init-vals, { + ^bb1(cond-args): + cond-region + return cond + }, { + ^bb1(body-args): + body-region + }) + As long as `cond-region` returns a "true"-like value, the body-region + is executed and the arguments are replaced by its return values for the next + iteration + ''' + @classmethod + def create(cls, opb, loc, inputs, outputs): + state = mlir.OperationState(loc, "tfp.While") + state.addOperands(inputs) + state.addTypes(outputs) + state.addRegion().push_back(Block.new()) # cond region + state.addRegion().push_back(Block.new()) # body region + return opb.createOperation(state) + +class TFProgram: + ''' + Python wrap for a Tensorflow Program (essentially an mlir Module) + ''' + def __init__(self): + mlir.registerDialects() + self.ctx = mlir.MLIRContext() + self.builder = mlir.Builder(self.ctx) + self.module = mlir.ModuleOp.create(mlir.UnknownLoc.get(self.ctx)) + self.curr_func = None + + def add_function(self, name, func_type): + self.curr_func = mlir.FuncOp.create( + mlir.UnknownLoc.get(self.ctx), name, func_type) + self.module.push_back(self.curr_func) + return self.curr_func + + def get_function_type(self, inputs, outputs): + return self.builder.getFunctionType(inputs, outputs) + + def dump(self): + self.module.dump() + + def __str__(self): + return self.module.getAsStr() diff --git a/tensorflow/python/tf_program/tests/BUILD b/tensorflow/python/tf_program/tests/BUILD new file mode 100644 index 00000000000..6b62d5528ec --- /dev/null +++ b/tensorflow/python/tf_program/tests/BUILD @@ -0,0 +1,31 @@ +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +package(licenses = ["notice"]) + +tf_python_pybind_extension( + name = "filecheck_wrapper", + srcs = ["filecheck_wrapper.cc"], + module_name = "filecheck_wrapper", + deps = [ + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:support", + "@pybind11", + ], +) + +py_test( + name = "mlir_gen_test", + size = "small", + testonly = True, + srcs = ["mlir_gen_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + tags = ["no_pip"], + deps = [ + ":filecheck_wrapper", + "//tensorflow/python:client_testlib", + "//tensorflow/python/tf_program:mlir_gen", + "//tensorflow/python/types", + ], +) diff --git a/tensorflow/python/tf_program/tests/filecheck_wrapper.cc b/tensorflow/python/tf_program/tests/filecheck_wrapper.cc new file mode 100644 index 00000000000..4d3d4af4f11 --- /dev/null +++ b/tensorflow/python/tf_program/tests/filecheck_wrapper.cc @@ -0,0 +1,36 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" +#include "llvm/Support/FileCheck.h" +#include "llvm/Support/SourceMgr.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" +#include "tensorflow/python/lib/core/pybind11_status.h" + +PYBIND11_MODULE(filecheck_wrapper, m) { + m.def("check", [](std::string input, std::string check) { + llvm::FileCheckRequest fcr; + llvm::FileCheck fc(fcr); + llvm::SourceMgr SM = llvm::SourceMgr(); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input), + llvm::SMLoc()); + SM.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(check), + llvm::SMLoc()); + llvm::Regex regex = fc.buildCheckPrefixRegex(); + fc.readCheckFile(SM, llvm::StringRef(check), regex); + return fc.checkInput(SM, llvm::StringRef(input)); + }); +} diff --git a/tensorflow/python/tf_program/tests/mlir_gen_test.py b/tensorflow/python/tf_program/tests/mlir_gen_test.py new file mode 100644 index 00000000000..664d561fb6a --- /dev/null +++ b/tensorflow/python/tf_program/tests/mlir_gen_test.py @@ -0,0 +1,228 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +'''Tests for `mlir_gen` module''' + +# pylint: disable=missing-function-docstring +# pylint: disable=invalid-name + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.platform import test +from tensorflow.python.types import core +from tensorflow.python.tf_program.mlir_gen import mlir_gen + +import tensorflow.python.tf_program.tests.filecheck_wrapper as fw + +class MLIRGenTestBase(test.TestCase): + + def _check_code(self, mlir_code, exp_mlir_code): + return self.assertTrue(fw.check(str(mlir_code), exp_mlir_code)) + +class MLIRGenTest(MLIRGenTestBase): + '''MLIR Generation Tests for Tensorflow Program''' + + def test_simple(self): + + def test_fn(): + pass + + mlir_code = mlir_gen(test_fn) + mlir_code_exp = r''' + CHECK-LABEL: @test_fn + ''' + self._check_code(mlir_code, mlir_code_exp) + + def test_argument(self): + + def test_fn(x: core.Tensor) -> core.Tensor: + return x + + mlir_code = mlir_gen(test_fn) + mlir_code_exp = r''' + CHECK-LABEL: @test_fn(%arg0: tensor<*xi32>) -> tensor<*xi32> { + CHECK-NEXT: return %arg0 : tensor<*xi32> + ''' + self._check_code(mlir_code, mlir_code_exp) + + def test_constant(self): + def test_fn()->int: + return 23 + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn() -> i32 + CHECK: %[[r0:[0-9]+]] = "tf.Const"() {value = dense<23> : tensor} : () -> tensor + CHECK: return %[[r0]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_BoolOp(self): + def test_fn(x: bool, y: bool)->bool: + return x or y or x and x and y + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: i1, %arg1: i1) -> i1 + CHECK: 
%[[r0:[0-9]+]] = "tfp.And"(%arg0, %arg0, %arg1) : (i1, i1, i1) -> tensor<*xi1> + CHECK: %[[r1:[0-9]+]] = "tfp.Or"(%arg0, %arg1, %[[r0]]) : (i1, i1, tensor<*xi1>) -> tensor<*xi1> + return %[[r1]] : tensor<*xi1> + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_Call(self): + def test_fn(): + def f1(): + return 23 + def f2(): + return f1() + f2() + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn() + CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @f2} : () -> () + CHECK: } + CHECK-LABEL: func @f1() { + CHECK: %[[r0:[0-9]+]] = "tf.Const"() {value = dense<23> : tensor} : () -> tensor + CHECK: return %[[r0]] : tensor + CHECK: } + CHECK-LABEL: func @f2() { + CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @f1} : () -> () + } + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_Compare(self): + def test_fn(x: core.Tensor, y: core.Tensor, z: core.Tensor): + return x > y < z + + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor<*xi32>) + CHECK: %[[r0:[0-9]+]] = "tf.Greater"(%arg0, %arg1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi1> + CHECK: %[[r1:[0-9]+]] = "tf.Less"(%[[r0]], %arg2) : (tensor<*xi1>, tensor<*xi32>) -> tensor<*xi1> + CHECK: return %[[r1]] : tensor<*xi1> + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_Assign_BinOp(self): + def test_fn()->int: + y = 12 + 23 - 24 + return y + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn() -> i32 + CHECK: %[[r0:[0-9]+]] = "tf.AddV2"(%{{[0-9]+}}, %{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK: %[[r1:[0-9]+]] = "tf.Sub"(%{{[0-9]+}}, %{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK: return %[[r1]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_if(self): + def test_fn(x: core.Tensor)->int: + res = 0 + if x > 0: + res = 1 + elif x < 0: + res = -1 + else: + res = 0 + return res + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>) -> i32 + + CHECK: %[[r1:[0-9]+]] = "tf.Greater"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK-NEXT: %[[r2:[0-9]+]] = "tfp.If"(%[[r1]]) ( { + CHECK: return %{{[0-9]+}} : tensor + CHECK-NEXT: }, { + CHECK: %[[r3:[0-9]+]] = "tf.Less"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK: %[[r4:[0-9]+]] = "tfp.If"(%[[r3]]) ( { + CHECK: %[[r5:[0-9]+]] = "tf.Neg"(%{{[0-9]+}}) : (tensor) -> tensor + CHECK: return %[[r5]] : tensor + CHECK-NEXT: }, { + CHECK: return %{{[0-9]+}} : tensor + CHECK-NEXT: }) : (tensor<*xi1>) -> tensor + CHECK: return %[[r4]] : tensor + CHECK-NEXT: }) : (tensor<*xi1>) -> tensor + CHECK-NEXT: return %[[r2]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_while(self): + def test_fn(x: core.Tensor)->core.Tensor: + s = 0 + while x > 0: + s = s + x + return s + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>) -> tensor<*xi32> + + CHECK: %[[r1:[0-9]+]] = "tfp.While"(%0) ( { + CHECK-NEXT: ^{{[^ ]+}}(%arg1: tensor): + CHECK: %[[r2:[0-9]+]] = "tf.Greater"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK-NEXT: return %[[r2]] : tensor<*xi1> + CHECK-NEXT: }, { + CHECK-NEXT: ^{{[^ ]+}}(%arg1: tensor): + CHECK: %[[r3:[0-9]+]] = "tf.AddV2"(%arg1, %arg0) : (tensor, tensor<*xi32>) -> tensor<*xi32> + CHECK-NEXT: return %[[r3]] : tensor<*xi32> + CHECK-NEXT: }) : (tensor) 
-> tensor + CHECK-NEXT: return %[[r1]] : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + def test_fibonacci(self): + def test_fn(x: core.Tensor)->core.Tensor: + res, idx = 0, 2 + a, b = 0, 1 + if x == 0 or x == 1: + res = x + else: + while idx <= x: + res = a + b + a = b + b = res + idx = idx + 1 + return res + mlir_code = mlir_gen(test_fn) + exp_mlir_code = r''' + CHECK-LABEL: @test_fn(%arg0: tensor<*xi32>) -> tensor<*xi32> + CHECK: %[[r5:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK: %[[r7:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> + CHECK: %[[r8:[0-9]+]] = "tfp.Or"(%[[r5]], %[[r7]]) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> + + CHECK: %[[r9:[0-9]+]]:4 = "tfp.If"(%[[r8]]) ( { + CHECK-NEXT: return %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32> + CHECK-NEXT: }, { + CHECK-NEXT: %[[r10:[0-9]+]]:4 = "tfp.While"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) ( { + CHECK-NEXT: ^{{[^ ]*}}(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): + CHECK-NEXT: %[[r11:[0-9]+]] = "tf.LessEqual"(%arg{{[0-9]+}}, %arg{{[0-9]+}}) : (tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>) -> tensor<*xi1> + CHECK-NEXT: return %[[r11]] : tensor<*xi1> + CHECK-NEXT: }, { + CHECK-NEXT: ^{{[^ ]*}}(%arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor): + CHECK-NEXT: %[[r12:[0-9]+]] = "tf.AddV2"(%arg{{[0-9]+}}, %arg{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK: %[[r13:[0-9]+]] = "tf.AddV2"(%arg{{[0-9]+}}, %{{[0-9]+}}) : (tensor, tensor) -> tensor + CHECK-NEXT: return %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : tensor, tensor, tensor, tensor + CHECK-NEXT: }) : (tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor) + CHECK-NEXT: return %[[r10]]#{{[0-9]+}}, %[[r10]]#{{[0-9]+}}, %[[r10]]#{{[0-9]+}}, %[[r10]]#{{[0-9]+}} : tensor, tensor, tensor, tensor + CHECK-NEXT: }) : (tensor<*xi1>) -> (tensor, tensor, tensor, tensor) + CHECK-NEXT: return %[[r9]]#{{[0-9]+}} : tensor + ''' + self._check_code(mlir_code, exp_mlir_code) + + +if __name__ == '__main__': + test.main() From 3f37c0e264bda211a5c16d6bc7d97f40e2d8d68c Mon Sep 17 00:00:00 2001 From: Shraiysh Vaishay Date: Sat, 9 May 2020 17:27:08 +0530 Subject: [PATCH 022/412] Added no_oss_py2 tag and changed srcs_version for tests to PY3 Signed-off-by: Shraiysh Vaishay --- tensorflow/python/tf_program/tests/BUILD | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/tf_program/tests/BUILD b/tensorflow/python/tf_program/tests/BUILD index 6b62d5528ec..2d36e4b295f 100644 --- a/tensorflow/python/tf_program/tests/BUILD +++ b/tensorflow/python/tf_program/tests/BUILD @@ -20,8 +20,11 @@ py_test( testonly = True, srcs = ["mlir_gen_test.py"], python_version = "PY3", - srcs_version = "PY2AND3", - tags = ["no_pip"], + srcs_version = "PY3", + tags = [ + "no_oss_py2", + "no_pip", + ], deps = [ ":filecheck_wrapper", "//tensorflow/python:client_testlib", From c4ab2d38bcd48b5ae9cca7fe94ac11a8a28683fa Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 10 May 2020 03:37:15 +0000 Subject: [PATCH 023/412] Add mlir_graph_optimization_pass.h header to pip wheel This PR adds mlir_graph_optimization_pass.h header to tf-nightly pip wheel. mlir_graph_optimization_pass.h is a header file that allows to register mlir based graph optimizaton (either part of the tensorflow, or externally registered). 
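For illustration, an externally built pass would include this header and hand an instance to the
global registry at load time. The sketch below shows the rough shape only; the exact class,
method, and registry names are assumptions for illustration, not quoted verbatim from the header:

    // Sketch only: see mlir_graph_optimization_pass.h for the real interface;
    // the names used here are assumptions.
    #include <memory>
    #include "tensorflow/compiler/mlir/mlir_graph_optimization_pass.h"

    namespace {

    class ExampleMlirPass : public tensorflow::MlirOptimizationPass {
     public:
      llvm::StringRef name() const override { return "example-mlir-pass"; }
      bool IsEnabled(const tensorflow::ConfigProto& config) const override {
        return true;
      }
      tensorflow::Status Run(const tensorflow::ConfigProto& config,
                             mlir::ModuleOp module) override {
        // Rewrite `module` in place here.
        return tensorflow::Status::OK();
      }
    };

    // Hand the pass to the (assumed) global registry when the library loads.
    const bool kRegistered = []() {
      tensorflow::MlirOptimizationPassRegistry::Global().Add(
          /*priority=*/10, std::make_unique<ExampleMlirPass>());
      return true;
    }();

    }  // namespace
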
However, it is not part of the pip install so it is not possible to register with installed version of tensorflow. This PR adds the header file to be part of the pip install. This PR is related to 39231 Signed-off-by: Yong Tang --- tensorflow/tools/pip_package/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index 70d88f294bc..2c3734d2fc2 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -28,6 +28,7 @@ transitive_hdrs( deps = [ "//tensorflow/c/experimental:network", "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function", + "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", From e1bed0f7af858d4d40ad6501994f819895330803 Mon Sep 17 00:00:00 2001 From: Shunya Ueta Date: Sun, 10 May 2020 18:01:42 +0900 Subject: [PATCH 024/412] Remove Python2 badge in raspberry Pi into README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 27032043e07..a6de2984dd8 100644 --- a/README.md +++ b/README.md @@ -112,8 +112,8 @@ Build Type | Status **Windows CPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-cpu.html) | [PyPI](https://pypi.org/project/tf-nightly/) **Windows GPU** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/windows-gpu.html) | [PyPI](https://pypi.org/project/tf-nightly-gpu/) **Android** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/android.html) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) -**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv6l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) -**Raspberry Pi 2 and 3** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py2.html) [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py2](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp27-none-linux_armv7l.whl) [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) +**Raspberry Pi 0 and 1** | [![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi01-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv6l.whl) +**Raspberry Pi 2 and 3** | 
[![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.svg)](https://storage.googleapis.com/tensorflow-kokoro-build-badges/rpi23-py3.html) | [Py3](https://storage.googleapis.com/tensorflow-nightly/tensorflow-1.10.0-cp34-none-linux_armv7l.whl) ### Community Supported Builds From 85fcfd9191cb4fdefa3ed8690bddd4b8eda0836d Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sun, 10 May 2020 18:52:18 +0000 Subject: [PATCH 025/412] Update protobuf-java to 3.9.2 This PR updates protobuf-java to 3.9.2, to match C++ version in tensorflow/workspace.bzl (3.9.2), and to fix the issue raised in 39381. Signed-off-by: Yong Tang --- tensorflow/java/maven/proto/pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml index ce1acc20b00..aa4a9bb4618 100644 --- a/tensorflow/java/maven/proto/pom.xml +++ b/tensorflow/java/maven/proto/pom.xml @@ -16,7 +16,7 @@ com.google.protobuf protobuf-java - 3.5.1 + 3.9.2 From c6d2369174a69d3f873100f02788a50c396395ea Mon Sep 17 00:00:00 2001 From: marload Date: Mon, 11 May 2020 15:37:57 +0900 Subject: [PATCH 026/412] Refactoring: Format String -> Format Method --- configure.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/configure.py b/configure.py index a003265f3c9..ca2ff597a31 100644 --- a/configure.py +++ b/configure.py @@ -144,7 +144,7 @@ def write_to_bazelrc(line): def write_action_env_to_bazelrc(var_name, var): - write_to_bazelrc('build --action_env %s="%s"' % (var_name, str(var))) + write_to_bazelrc('build --action_env {}="{}"'.format(var_name, str(var))) def run_shell(cmd, allow_non_zero=False, stderr=None): @@ -205,7 +205,7 @@ def setup_python(environ_cp): # Get PYTHON_BIN_PATH, default is the current running python. default_python_bin_path = sys.executable ask_python_bin_path = ('Please specify the location of python. [Default is ' - '%s]: ') % default_python_bin_path + '{}]: ').format(default_python_bin_path) while True: python_bin_path = get_from_env_or_user_or_default(environ_cp, 'PYTHON_BIN_PATH', @@ -215,9 +215,9 @@ def setup_python(environ_cp): if os.path.isfile(python_bin_path) and os.access(python_bin_path, os.X_OK): break elif not os.path.exists(python_bin_path): - print('Invalid python path: %s cannot be found.' % python_bin_path) + print('Invalid python path: {} cannot be found.'.format(python_bin_path)) else: - print('%s is not executable. Is it the python binary?' % python_bin_path) + print('{} is not executable. Is it the python binary?'.format(python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = '' # Convert python path to Windows style before checking lib and version @@ -236,7 +236,7 @@ def setup_python(environ_cp): default_python_lib_path = python_lib_paths[0] python_lib_path = get_input( 'Please input the desired Python library path to use. 
' - 'Default is [%s]\n' % python_lib_paths[0]) + 'Default is [{}]\n'.format(python_lib_paths[0])) if not python_lib_path: python_lib_path = default_python_lib_path environ_cp['PYTHON_LIB_PATH'] = python_lib_path @@ -252,7 +252,7 @@ def setup_python(environ_cp): # Set-up env variables used by python_configure.bzl write_action_env_to_bazelrc('PYTHON_BIN_PATH', python_bin_path) write_action_env_to_bazelrc('PYTHON_LIB_PATH', python_lib_path) - write_to_bazelrc('build --python_path=\"%s"' % python_bin_path) + write_to_bazelrc('build --python_path=\"{}"'.format(python_bin_path)) environ_cp['PYTHON_BIN_PATH'] = python_bin_path # If choosen python_lib_path is from a path specified in the PYTHONPATH @@ -266,7 +266,7 @@ def setup_python(environ_cp): with open( os.path.join(_TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'), 'w') as f: - f.write('export PYTHON_BIN_PATH="%s"' % python_bin_path) + f.write('export PYTHON_BIN_PATH="{}"'.format(python_bin_path)) def reset_tf_configure_bazelrc(): @@ -320,11 +320,11 @@ def get_var(environ_cp, Raise the error to avoid infinitely looping. """ if not question: - question = 'Do you wish to build TensorFlow with %s support?' % query_item + question = 'Do you wish to build TensorFlow with {} support?'.format(query_item) if not yes_reply: - yes_reply = '%s support will be enabled for TensorFlow.' % query_item + yes_reply = '{} support will be enabled for TensorFlow.'.format(query_item) if not no_reply: - no_reply = 'No %s' % yes_reply + no_reply = 'No {}'.format(yes_reply) yes_reply += '\n' no_reply += '\n' @@ -368,7 +368,7 @@ def get_var(environ_cp, print(no_reply) var = False else: - print('Invalid selection: %s' % user_input_origin) + print('Invalid selection: {}'.format(user_input_origin)) return var From eb75c470f909daa72fec6f397ba4892d9d178842 Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Mon, 11 May 2020 10:18:23 -0700 Subject: [PATCH 027/412] adding version number. 
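For context, the substitution changed below feeds the dnnl_version.h template, so downstream
code keys off the generated macros. A minimal sketch of such a check, assuming the generated
header is on the include path:

    #include <cstdio>
    #include "dnnl_version.h"  // generated from the template rule patched below

    int main() {
      // These macros come straight from the substitutions in this BUILD rule.
      std::printf("built against DNNL %d.%d.%d\n", DNNL_VERSION_MAJOR,
                  DNNL_VERSION_MINOR, DNNL_VERSION_PATCH);
    #if DNNL_VERSION_MAJOR == 1 && DNNL_VERSION_MINOR >= 4
      std::printf("DNNL 1.4+ threadpool build\n");
    #endif
      return 0;
    }
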
--- third_party/mkl_dnn/mkldnn_threadpool.BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/mkl_dnn/mkldnn_threadpool.BUILD b/third_party/mkl_dnn/mkldnn_threadpool.BUILD index 35175b7f90f..7209b8a62d0 100644 --- a/third_party/mkl_dnn/mkldnn_threadpool.BUILD +++ b/third_party/mkl_dnn/mkldnn_threadpool.BUILD @@ -42,7 +42,7 @@ template_rule( out = "include/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "1", - "@DNNL_VERSION_MINOR@": "2", + "@DNNL_VERSION_MINOR@": "4", "@DNNL_VERSION_PATCH@": "0", "@DNNL_VERSION_HASH@": "N/A", }, From 0a980f296919766407af45b95c9e8aa290f72569 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Tue, 5 May 2020 10:54:54 +0000 Subject: [PATCH 028/412] ROCm 3.5 (hip-clang) build fixes --- .../service/gpu/llvm_gpu_backend/gpu_backend_lib.cc | 2 +- .../stream_executor/rocm/rocm_gpu_executor.cc | 4 ++++ .../clang/bin/crosstool_wrapper_driver_rocm.tpl | 4 +++- third_party/gpus/cuda_configure.bzl | 13 +++++++++---- third_party/gpus/rocm_configure.bzl | 2 ++ 5 files changed, 19 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc index 060a0375271..497dcda4361 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.cc @@ -689,7 +689,7 @@ std::unique_ptr AMDGPUGetTargetMachine( llvm::Triple target_triple, int amdgpu_version, const HloModuleConfig& hlo_module_config) { return GetTargetMachine(target_triple, absl::StrCat("gfx", amdgpu_version), - hlo_module_config, "-code-object-v3"); + hlo_module_config, "+code-object-v3"); } void AMDGPUBackendInit(const HloModuleConfig& hlo_module_config) { diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc index e22a243a70b..216602a7597 100644 --- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc +++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc @@ -132,6 +132,10 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) { VLOG(3) << "Unloading HSACO module " << module; GpuDriver::UnloadModule(context_, module); gpu_binary_to_module_.erase(module_it); + const char* mem_it = nullptr; + for (auto x : in_memory_modules_) + if (x.second == module) mem_it = x.first; + if (mem_it != nullptr) in_memory_modules_.erase(mem_it); } return true; } diff --git a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl index f5ac7b39dfd..89275128a9c 100755 --- a/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl +++ b/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl @@ -179,7 +179,7 @@ def InvokeHipcc(argv, log=False): # Also we need to retain warning about uninitialised shared variable as # warning only, even when -Werror option is specified. 
if HIPCC_IS_HIPCLANG: - hipccopts += ' --include=hip/hip_runtime.h -Wno-error=cuda-shared-init ' + hipccopts += ' --include=hip/hip_runtime.h ' hipccopts += ' ' + hipcc_compiler_options # Use -fno-gpu-rdc by default for early GPU kernel finalization # This flag would trigger GPU kernels be generated at compile time, instead @@ -258,6 +258,8 @@ def main(): gpu_linker_flags.append('-L' + HIP_RUNTIME_PATH) gpu_linker_flags.append('-Wl,-rpath=' + HIP_RUNTIME_PATH) gpu_linker_flags.append('-l' + HIP_RUNTIME_LIBRARY) + if HIPCC_IS_HIPCLANG: + gpu_linker_flags.append("-lrt") if VERBOSE: print(' '.join([CPU_COMPILER] + gpu_linker_flags)) return subprocess.call([CPU_COMPILER] + gpu_linker_flags) diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index 545aeebe97a..ce924fe4cd2 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -808,23 +808,28 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): cmd = \"""%s \""", )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) -def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir): +def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions=None): """Returns a rule to recursively copy a directory.""" src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) + post_cmd='' + if exceptions!=None: + outs = [x for x in outs if not any([x.startswith(y) for y in exceptions])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] - # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" + if exceptions!=None: + for x in exceptions: + post_cmd+=" ; rm -fR " + x.replace(src_dir, out_dir) return """genrule( name = "%s", outs = [ %s ], - cmd = \"""cp -rLf "%s/." "%s/" \""", -)""" % (name, "\n".join(outs), src_dir, out_dir) + cmd = \"""cp -rLf "%s/." 
"%s/" %s\""", +)""" % (name, "\n".join(outs), src_dir, out_dir, post_cmd) def _flag_enabled(repository_ctx, flag_name): return get_host_environ(repository_ctx, flag_name) == "1" diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 3c345e6724b..3f518fb05f1 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -615,6 +615,8 @@ def _create_local_rocm_repository(repository_ctx): name = "rocm-include", src_dir = rocm_toolkit_path + "/include", out_dir = "rocm/include", + exceptions = [rocm_toolkit_path + "/include/gtest", + rocm_toolkit_path + "/include/gmock"], ), make_copy_dir_rule( repository_ctx, From cd24c2bdc771aa3576ef1fc1699374eabdc5447b Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Tue, 12 May 2020 11:06:01 +0800 Subject: [PATCH 029/412] [tflite] add fp16 support for evaluation tools --- .../lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc | 2 ++ .../lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h | 3 +++ .../lite/tools/delegates/default_execution_provider.cc | 4 ++++ .../lite/tools/evaluation/evaluation_delegate_provider.cc | 4 ++++ .../lite/tools/evaluation/proto/evaluation_stages.proto | 3 +++ .../lite/tools/evaluation/stages/tflite_inference_stage.cc | 1 + .../tools/evaluation/tasks/coco_object_detection/run_eval.cc | 5 +++++ .../tasks/imagenet_image_classification/run_eval.cc | 5 +++++ .../lite/tools/evaluation/tasks/inference_diff/run_eval.cc | 3 +++ 9 files changed, 30 insertions(+) diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index f318dc68d09..61c2acb8b2e 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -141,6 +141,8 @@ class CompositeObserver : public ImagenetModelEvaluator::Observer { tflite::Flag::CreateFlag(kNumRanksFlag, ¶ms.num_ranks, "Generates the top-1 to top-k accuracy values" "where k = num_ranks. Default: 10"), + tflite::Flag::CreateFlag("allow_fp16", ¶ms.allow_fp16, + "allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h index 65d4a2c49f8..323069383c3 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h @@ -78,6 +78,9 @@ class ImagenetModelEvaluator { // Number of interpreter threads. int num_interpreter_threads = 1; + + // allow fp16 + bool allow_fp16 = false; }; // An evaluation observer. 
diff --git a/tensorflow/lite/tools/delegates/default_execution_provider.cc b/tensorflow/lite/tools/delegates/default_execution_provider.cc index f75fd791072..67c38308206 100644 --- a/tensorflow/lite/tools/delegates/default_execution_provider.cc +++ b/tensorflow/lite/tools/delegates/default_execution_provider.cc @@ -30,6 +30,7 @@ class DefaultExecutionProvider : public DelegateProvider { ToolParam::Create(0)); default_params_.AddParam("min_nodes_per_partition", ToolParam::Create(0)); + default_params_.AddParam("allow_fp16", ToolParam::Create(false)); } std::vector CreateFlags(ToolParams* params) const final; @@ -44,6 +45,7 @@ std::vector DefaultExecutionProvider::CreateFlags( std::vector flags = { CreateFlag("num_threads", params, "number of threads used for inference on CPU."), + CreateFlag("allow_fp16", params, "allow_fp16"), CreateFlag("max_delegated_partitions", params, "Max number of partitions to be delegated."), CreateFlag( @@ -61,6 +63,8 @@ void DefaultExecutionProvider::LogParams(const ToolParams& params) const { << params.Get("max_delegated_partitions") << "]"; TFLITE_LOG(INFO) << "Min nodes per partition : [" << params.Get("min_nodes_per_partition") << "]"; + TFLITE_LOG(INFO) << "allow_fp16: [" + << params.Get("allow_fp16") << "]"; } TfLiteDelegatePtr DefaultExecutionProvider::CreateTfLiteDelegate( diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc index 42f2666ba9b..a7625441406 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc @@ -132,6 +132,10 @@ tools::ToolParams DelegateProviders::GetAllParams( tool_params.Set("num_threads", params.num_threads()); } + if (params.has_allow_fp16()) { + tool_params.Set("allow_fp16", params.allow_fp16()); + } + const auto type = params.delegate(); switch (type) { case TfliteInferenceParams::NNAPI: diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto index 09765d71726..c7d033eb111 100644 --- a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto +++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto @@ -121,6 +121,9 @@ message TfliteInferenceParams { // This helps benchmark cases where extensive pre-processing might not be // required for every input. optional int32 invocations_per_run = 4 [default = 1]; + + // allow_fp16 + optional bool allow_fp16 = 5 [default = false]; } // Metrics specific to TFLite inference. 
diff --git a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc index 365a00c3cd1..8189140e953 100644 --- a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc +++ b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc @@ -95,6 +95,7 @@ TfLiteStatus TfliteInferenceStage::Init( return kTfLiteError; } interpreter_->SetNumThreads(params.num_threads()); + interpreter_->SetAllowFp16PrecisionForFp32(params.allow_fp16()); if (!delegate_providers) { std::string error_message; diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index 765e8fc6465..1ff4e55c270 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -65,6 +65,7 @@ class CocoObjectDetection : public TaskExecutor { bool debug_mode_; std::string delegate_; int num_interpreter_threads_; + bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -104,6 +105,9 @@ CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'xnnpack', 'hexagon'}"), + tflite::Flag::CreateFlag( + "allow_fp16", &allow_fp16_, + "allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); DelegateProviders delegate_providers; @@ -132,6 +136,7 @@ absl::optional CocoObjectDetection::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); + inference_params->set_allow_fp16(allow_fp16_); // Get ground truth data. absl::flat_hash_map ground_truth_map; diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc index 13eeb313ad4..1e1cf86732a 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc @@ -67,6 +67,7 @@ class ImagenetClassification : public TaskExecutor { std::string delegate_; int num_images_; int num_interpreter_threads_; + bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -106,6 +107,9 @@ ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. 
" "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), + tflite::Flag::CreateFlag( + "allow_fp16", &allow_fp16_, + "allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -155,6 +159,7 @@ absl::optional ImagenetClassification::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); + inference_params->set_allow_fp16(allow_fp16_); classification_params->mutable_topk_accuracy_eval_params()->set_k(10); ImageClassificationStage eval(eval_config); diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc index 814ebe3b3bf..de41fb96a03 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc @@ -50,6 +50,7 @@ class InferenceDiff : public TaskExecutor { std::string delegate_; int num_runs_; int num_interpreter_threads_; + bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -71,6 +72,7 @@ InferenceDiff::InferenceDiff(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for test inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), + tflite::Flag::CreateFlag("allow_fp16", &allow_fp16_, "allow fp16") }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -88,6 +90,7 @@ absl::optional InferenceDiff::Run() { // generating random data. inference_params->set_invocations_per_run(3); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); + inference_params->set_allow_fp16(allow_fp16_); if (!delegate_.empty() && inference_params->delegate() == TfliteInferenceParams::NONE) { TFLITE_LOG(WARN) << "Unsupported TFLite delegate: " << delegate_; From b76ef65778eb1ebd67d5f43d88b1e353c1c41a0c Mon Sep 17 00:00:00 2001 From: Peng Sun Date: Mon, 11 May 2020 08:53:29 +0100 Subject: [PATCH 030/412] add bias to transpose_conv TESTs. 
--- .../lite/testing/op_tests/transpose_conv.py | 25 ++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/testing/op_tests/transpose_conv.py b/tensorflow/lite/testing/op_tests/transpose_conv.py index 654856f0d88..ce30860e289 100644 --- a/tensorflow/lite/testing/op_tests/transpose_conv.py +++ b/tensorflow/lite/testing/op_tests/transpose_conv.py @@ -38,6 +38,7 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 3, 4, 1], [1, 10, 10, 3], [3, 20, 20, 1]], "filter_size": [[1, 1], [1, 2], [3, 3]], + "has_bias": [False], "strides": [[1, 1, 1, 1], [1, 3, 3, 1]], "padding": ["SAME", "VALID"], "data_format": ["NHWC"], @@ -50,6 +51,7 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 3, 3, 1]], "filter_size": [[3, 3, 2, 1]], + "has_bias": [False], "strides": [[1, 1, 1, 1]], "padding": ["SAME"], "data_format": ["NHWC"], @@ -60,6 +62,7 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 3, 3, 1]], "filter_size": [[3, 3, 2, 1]], + "has_bias": [False], "strides": [[1, 2, 2, 1]], "padding": ["SAME"], "data_format": ["NHWC"], @@ -70,13 +73,26 @@ def make_transpose_conv_tests(options): { "input_shape": [[1, 4, 3, 1]], "filter_size": [[3, 3, 2, 1]], + "has_bias": [False], "strides": [[1, 2, 2, 1]], "padding": ["SAME"], "data_format": ["NHWC"], "channel_multiplier": [1], "output_shape": [[1, 8, 6, 2]], "fully_quantize": [True] - } + }, + { + "input_shape": [[1, 3, 3, 1]], + "filter_size": [[3, 3, 2, 1]], + "has_bias": [True], + "strides": [[1, 1, 1, 1]], + "padding": ["SAME"], + "data_format": ["NHWC"], + "channel_multiplier": [1], + "output_shape": [[1, 3, 3, 2]], + "fully_quantize": [True] + }, + ] def get_tensor_shapes(parameters): @@ -124,6 +140,13 @@ def make_transpose_conv_tests(options): strides=parameters["strides"], padding=parameters["padding"], data_format=parameters["data_format"]) + if parameters["has_bias"]: + bias_input = create_tensor_data(np.float32, + (parameters["output_shape"][-1],), + min_value=-1, + max_value=1) + out = tf.nn.bias_add(out, bias_input, + data_format=parameters["data_format"]) return input_tensors, [out] From df1ea3b0d4c834fe1dba719c27e3291f9af25c96 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 12 May 2020 15:08:56 +0000 Subject: [PATCH 031/412] Update docstring of tf.split to clarify num_or_size_splits Signed-off-by: Yong Tang --- tensorflow/python/ops/array_ops.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index fbb977f8d9a..8d1284da3d0 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1919,9 +1919,9 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): See also `tf.unstack`. - If `num_or_size_splits` is an integer, then `value` is split along the - dimension `axis` into `num_split` smaller tensors. This requires that - `value.shape[axis]` is divisible by `num_split`. + If `num_or_size_splits` is an integer, we call it num_split and + `value` is split along the dimension `axis` into `num_split` smaller + tensors. This requires that `value.shape[axis]` is divisible by `num_split`. If `num_or_size_splits` is a 1-D Tensor (or list), we call it `size_splits` and `value` is split into `len(size_splits)` elements. 
The shape of the `i`-th From 275874a436c06be8d13521c291bde77d9e697c1a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 12 May 2020 15:15:45 +0000 Subject: [PATCH 032/412] Combine into one block in example of docstring so that it could be rendered correctly Signed-off-by: Yong Tang --- tensorflow/python/ops/array_ops.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8d1284da3d0..8aa5d66f402 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1931,13 +1931,15 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): For example: >>> x = tf.Variable(tf.random.uniform([5, 30], -1, 1)) - - Split `x` into 3 tensors along dimension 1 + >>> + >>> # Split `x` into 3 tensors along dimension 1: + ... >>> s0, s1, s2 = tf.split(x, num_or_size_splits=3, axis=1) >>> tf.shape(s0).numpy() array([ 5, 10], dtype=int32) - - Split `x` into 3 tensors with sizes [4, 15, 11] along dimension 1 + >>> + >>> # Split `x` into 3 tensors with sizes [4, 15, 11] along dimension 1 + ... >>> split0, split1, split2 = tf.split(x, [4, 15, 11], 1) >>> tf.shape(split0).numpy() array([5, 4], dtype=int32) From adb282e47c7c73813270b082a23e674cf7087885 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Tue, 12 May 2020 09:24:51 -0700 Subject: [PATCH 033/412] Bump open source llvm revision to 123bee602a260150ff55c74287f583a67ee78f36 PiperOrigin-RevId: 311139944 Change-Id: I31557f69d4c4cea061157fcff411f384dddeef05 --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 36dc0c2b101..fe548fdec05 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "728cf6d86b4f20144ac10517afb0cb978beac124" - LLVM_SHA256 = "41a24cf437be40c8a790b1095e6bfc3a9d531a44275abecddf2eda1835658bcc" + LLVM_COMMIT = "123bee602a260150ff55c74287f583a67ee78f36" + LLVM_SHA256 = "313ec75e47ea3f128724a61b8b6b45b7d305ba2ae57a5084b4bf1f881b4ec8f2" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 8e3bc844b1a081def879f563d49fee82e3a819ae Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Tue, 12 May 2020 09:31:06 -0700 Subject: [PATCH 034/412] Add support for a device ID op in parallel_device The op doesn't really make sense to register kernels for, so I'm not registering it anywhere by default yet; it's currently just registered in the parallel device tests. 
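Roughly, executing the op on the custom device yields one scalar per component device. A
condensed sketch of the eager C API call sequence, mirroring the test added in this change,
with error checking omitted and a placeholder custom-device name:

    #include "tensorflow/c/eager/c_api.h"

    // `context` must already have the parallel device registered; the device
    // name below is a placeholder, not taken from this patch.
    void FetchDeviceIds(TFE_Context* context) {
      TF_Status* status = TF_NewStatus();
      TFE_Op* op = TFE_NewOp(context, "DeviceID", status);
      TFE_OpSetDevice(op, "/job:localhost/replica:0/task:0/device:CUSTOM:0",
                      status);
      TFE_TensorHandle* result = nullptr;
      int num_retvals = 1;
      TFE_Execute(op, &result, &num_retvals, status);
      // `result` is a parallel tensor: one int64 scalar per component device,
      // holding that device's index (0, 1, ...).
      TFE_DeleteTensorHandle(result);
      TFE_DeleteOp(op);
      TF_DeleteStatus(status);
    }
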
PiperOrigin-RevId: 311141160 Change-Id: Iff1839112dac6fe3406e4b31f0e6f7239809a5bb --- tensorflow/c/eager/parallel_device/BUILD | 17 ++++++ .../eager/parallel_device/parallel_device.cc | 51 ++++++++++++++++ .../parallel_device/parallel_device_ops.cc | 26 ++++++++ .../parallel_device/parallel_device_test.cc | 59 ++++++++++++++----- .../python/distribute/parallel_device/BUILD | 23 ++++++++ .../parallel_device/parallel_device.py | 20 +++++++ .../parallel_device/parallel_device_test.py | 6 ++ 7 files changed, 186 insertions(+), 16 deletions(-) create mode 100644 tensorflow/c/eager/parallel_device/parallel_device_ops.cc diff --git a/tensorflow/c/eager/parallel_device/BUILD b/tensorflow/c/eager/parallel_device/BUILD index 92947be79f8..3b2640e14d1 100644 --- a/tensorflow/c/eager/parallel_device/BUILD +++ b/tensorflow/c/eager/parallel_device/BUILD @@ -44,6 +44,7 @@ tf_cc_test( srcs = ["parallel_device_test.cc"], deps = [ ":parallel_device", + ":parallel_device_ops", "//tensorflow/c:c_api", "//tensorflow/c:c_api_experimental", "//tensorflow/c/eager:c_api", @@ -53,3 +54,19 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +# Note: ParallelDevice-specific ops are experimental and not currently linked in +# to TensorFlow by default, just used in a few tests. +filegroup( + name = "parallel_device_ops_srcs", + srcs = ["parallel_device_ops.cc"], + visibility = ["//tensorflow/python/distribute/parallel_device:__pkg__"], +) + +cc_library( + name = "parallel_device_ops", + srcs = [":parallel_device_ops_srcs"], + visibility = ["//tensorflow:internal"], + deps = ["//tensorflow/core:framework"], + alwayslink = 1, +) diff --git a/tensorflow/c/eager/parallel_device/parallel_device.cc b/tensorflow/c/eager/parallel_device/parallel_device.cc index e6846809fcf..27c2699c4c2 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device.cc @@ -92,6 +92,10 @@ class ParallelDevice { TFE_TensorHandle* tensor, TF_Status* status) const; + // A parallel tensor with scalar integers numbering component devices. + std::unique_ptr DeviceIDs(TFE_Context* context, + TF_Status* status) const; + // Takes a description of a single operation being executed on the // ParallelDevice, and in turn runs one operation per component device with // its corresponding inputs from the input ParallelTensors (or @@ -208,6 +212,46 @@ std::unique_ptr ParallelDevice::CopyToParallelDevice( status); } +std::unique_ptr ParallelDevice::DeviceIDs( + TFE_Context* context, TF_Status* status) const { + // TODO(allenl): We could cache DeviceIDs (keyed by context). + std::vector components; + components.reserve(underlying_devices_.size()); + for (int device_index = 0; device_index < underlying_devices_.size(); + ++device_index) { + int64_t* device_id = new int64_t; + *device_id = device_index; + std::unique_ptr tensor( + TF_NewTensor( + TF_INT64, /*dims=*/nullptr, /*num_dims=*/0, device_id, + sizeof(int64_t), + [](void* data, size_t, void* arg) { + delete reinterpret_cast(data); + }, + nullptr), + TF_DeleteTensor); + // TODO(allenl): Here and when executing regular operations, we could hold + // on to one TFE_Op per device and just call TFE_ResetOp to avoid parsing + // device names repeatedly. 
+ OpPtr const_op(TFE_NewOp(context, "Const", status)); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetDevice(const_op.get(), underlying_devices_[device_index].c_str(), + status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrTensor(const_op.get(), "value", tensor.get(), status); + if (TF_GetCode(status) != TF_OK) return nullptr; + TFE_OpSetAttrType(const_op.get(), "dtype", TF_INT64); + TFE_TensorHandle* device_handle; + int num_outputs = 1; + TFE_Execute(const_op.get(), &device_handle, &num_outputs, status); + if (TF_GetCode(status) != TF_OK) return nullptr; + components.emplace_back(device_handle); + if (TF_GetCode(status) != TF_OK) return nullptr; + } + return ParallelTensor::FromTensorHandles(*this, std::move(components), + status); +} + absl::optional> ParallelDevice::Execute( TFE_Context* context, std::vector inputs, const char* operation_name, const TFE_OpAttrs* attributes, @@ -282,6 +326,13 @@ absl::optional> ParallelDevice::Execute( } result.emplace(std::move(outputs)); return result; + } else if (operation_name == std::string("DeviceID")) { + std::vector result_content; + result_content.reserve(1); + result_content.push_back(DeviceIDs(context, status)); + if (TF_GetCode(status) != TF_OK) return result; + result.emplace(std::move(result_content)); + return result; } absl::optional>> maybe_parallel_results( diff --git a/tensorflow/c/eager/parallel_device/parallel_device_ops.cc b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc new file mode 100644 index 00000000000..1decffca047 --- /dev/null +++ b/tensorflow/c/eager/parallel_device/parallel_device_ops.cc @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" + +// TODO(allenl): Figure out if we need this op, and if so whether we should move +// it to core TF. Right now the eager C API does some checking of op +// registrations before calling into custom devices, but we may be able to avoid +// that. +REGISTER_OP("DeviceID") + .Output("device_id: int64") + .SetIsStateful() + .SetShapeFn(tensorflow::shape_inference::ScalarShape); diff --git a/tensorflow/c/eager/parallel_device/parallel_device_test.cc b/tensorflow/c/eager/parallel_device/parallel_device_test.cc index 9b0613b0391..fdc140407df 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_test.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_test.cc @@ -278,14 +278,15 @@ TensorHandlePtr Multiply(TFE_Context* context, TFE_TensorHandle* first, } // Assert that `handle` is equal to `expected_value`. 
-void AssertScalarFloatEq(TFE_TensorHandle* handle, float expected_value) { +template +void ExpectScalarEq(TFE_TensorHandle* handle, value_type expected_value) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); std::unique_ptr value_zero( TFE_TensorHandleResolve(handle, status.get()), TF_DeleteTensor); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - ASSERT_EQ(expected_value, - *static_cast(TF_TensorData(value_zero.get()))); + EXPECT_EQ(expected_value, + *static_cast(TF_TensorData(value_zero.get()))); } template @@ -343,8 +344,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, ExtractPerDeviceValues(context, read.get(), &components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(components[0].get(), 20.); - AssertScalarFloatEq(components[1].get(), 20.); + ExpectScalarEq(components[0].get(), 20.); + ExpectScalarEq(components[1].get(), 20.); std::string first_device = TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); @@ -373,8 +374,8 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, ExtractPerDeviceValues(context, read.get(), &components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(components[0].get(), 23.); - AssertScalarFloatEq(components[1].get(), 18.); + ExpectScalarEq(components[0].get(), 23.); + ExpectScalarEq(components[1].get(), 18.); std::string first_device = TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); @@ -383,6 +384,32 @@ void BasicTestsForTwoDevices(TFE_Context* context, const char* first_device, TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); ASSERT_EQ(underlying_devices[1], second_device); } + // Compute the device ID twice and verify the result + for (int i = 0; i < 2; ++i) { + std::unique_ptr op( + TFE_NewOp(context, "DeviceID", status.get()), TFE_DeleteOp); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + TFE_OpSetDevice(op.get(), device_name, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + TFE_TensorHandle* result_handle; + int num_retvals = 1; + TFE_Execute(op.get(), &result_handle, &num_retvals, status.get()); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + std::array components; + ExtractPerDeviceValues(context, result_handle, &components, status.get()); + TFE_DeleteTensorHandle(result_handle); + ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); + + ExpectScalarEq(components[0].get(), 0); + ExpectScalarEq(components[1].get(), 1); + std::string first_device = + TFE_TensorHandleBackingDeviceName(components[0].get(), status.get()); + ASSERT_EQ(underlying_devices[0], first_device); + std::string second_device = + TFE_TensorHandleBackingDeviceName(components[1].get(), status.get()); + ASSERT_EQ(underlying_devices[1], second_device); + } } TEST(PARALLEL_DEVICE, TestBasicCPU) { @@ -498,8 +525,8 @@ TEST(PARALLEL_DEVICE, TestExplicitCopies) { ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); // The value of the original tensor is replicated on each device. - AssertScalarFloatEq(components[0].get(), 3.); - AssertScalarFloatEq(components[1].get(), 3.); + ExpectScalarEq(components[0].get(), 3.); + ExpectScalarEq(components[1].get(), 3.); // Verify that the mirrors are placed on the component devices. 
std::string first_device = @@ -630,7 +657,7 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { &second_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(second_components[1].get(), 9.); + ExpectScalarEq(second_components[1].get(), 9.); // Verify that the mirrors are placed on the component devices. std::string first_device = TFE_TensorHandleBackingDeviceName( @@ -644,8 +671,8 @@ TEST(PARALLEL_DEVICE, TestNestedParallelDevices) { std::array first_components; ExtractPerDeviceValues(context.get(), second_components[0].get(), &first_components, status.get()); - AssertScalarFloatEq(first_components[0].get(), 3.); - AssertScalarFloatEq(first_components[1].get(), 6.); + ExpectScalarEq(first_components[0].get(), 3.); + ExpectScalarEq(first_components[1].get(), 6.); first_device = TFE_TensorHandleBackingDeviceName(first_components[0].get(), status.get()); @@ -806,8 +833,8 @@ TEST(PARALLEL_DEVICE, TestCollective) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 3.); - AssertScalarFloatEq(result_components[1].get(), 3.); + ExpectScalarEq(result_components[0].get(), 3.); + ExpectScalarEq(result_components[1].get(), 3.); } void RegisterCollectiveMulFunction(TFE_Context* context, @@ -909,8 +936,8 @@ TEST(PARALLEL_DEVICE, TestFunction) { ExtractPerDeviceValues(context.get(), reduced.get(), &result_components, status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); - AssertScalarFloatEq(result_components[0].get(), 7. * 9.); - AssertScalarFloatEq(result_components[1].get(), 7. * 9.); + ExpectScalarEq(result_components[0].get(), 7. * 9.); + ExpectScalarEq(result_components[1].get(), 7. 
* 9.); std::string first_device = TFE_TensorHandleBackingDeviceName( result_components[0].get(), status.get()); diff --git a/tensorflow/python/distribute/parallel_device/BUILD b/tensorflow/python/distribute/parallel_device/BUILD index 43c6029f3d2..930816d4407 100644 --- a/tensorflow/python/distribute/parallel_device/BUILD +++ b/tensorflow/python/distribute/parallel_device/BUILD @@ -1,3 +1,6 @@ +load("//tensorflow:tensorflow.bzl", "tf_custom_op_library", "tf_gen_op_wrapper_py") +load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library") + package( default_visibility = ["//tensorflow:internal"], licenses = ["notice"], # Apache 2.0 @@ -14,6 +17,7 @@ py_library( srcs = ["parallel_device.py"], srcs_version = "PY2AND3", deps = [ + ":parallel_device_ops", ":saving", "//tensorflow/python:_pywrap_parallel_device", ], @@ -26,6 +30,25 @@ py_library( deps = ["//tensorflow/python:framework_ops"], ) +tf_gen_op_wrapper_py( + name = "parallel_device_ops_py", + out = "gen_parallel_device_ops.py", + deps = ["//tensorflow/c/eager/parallel_device:parallel_device_ops"], +) + +tf_custom_op_library( + name = "_parallel_device_ops.so", + srcs = ["//tensorflow/c/eager/parallel_device:parallel_device_ops_srcs"], +) + +tf_custom_op_py_library( + name = "parallel_device_ops", + dso = [":_parallel_device_ops.so"], + kernels = ["//tensorflow/c/eager/parallel_device:parallel_device_ops"], + visibility = ["//tensorflow:internal"], + deps = [":parallel_device_ops_py"], +) + py_test( name = "parallel_device_test", srcs = ["parallel_device_test.py"], diff --git a/tensorflow/python/distribute/parallel_device/parallel_device.py b/tensorflow/python/distribute/parallel_device/parallel_device.py index 982b061cdb7..2dbdc653a64 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device.py @@ -22,11 +22,17 @@ import contextlib import threading from tensorflow.python import _pywrap_parallel_device +from tensorflow.python.distribute.parallel_device import gen_parallel_device_ops from tensorflow.python.distribute.parallel_device import saving from tensorflow.python.eager import context +from tensorflow.python.framework import load_library from tensorflow.python.framework import ops +from tensorflow.python.platform import resource_loader from tensorflow.python.tpu.ops import tpu_ops +load_library.load_op_library( + resource_loader.get_path_to_datafile("_parallel_device_ops.so")) + _next_device_number = 0 _next_device_number_lock = threading.Lock() @@ -58,6 +64,8 @@ class ParallelDevice(object): device, device_info = _pywrap_parallel_device.GetParallelDeviceCapsules( self.name, self.components) context.register_custom_device(device, self.name, device_info) + with ops.device(self.name): + self._device_ids = gen_parallel_device_ops.device_id() def pack(self, tensors): """Create a tensor on the parallel device from a sequence of tensors. @@ -84,6 +92,18 @@ class ParallelDevice(object): return tpu_ops.tpu_replicated_output( parallel_tensor, num_replicas=len(self.components)) + @property + def device_ids(self): + """A parallel tensor with scalar integers numbering component devices. + + Each device ID is placed on its corresponding device, in the same order as + the `components` constructor argument. + + Returns: + A parallel tensor containing 0 on the first device, 1 on the second, etc. + """ + return self._device_ids + # TODO(allenl): Fixing saving in Python is a bit odd. 
One alternative would be # to provide a hook for the custom device to create save specs/etc., then call # that hook from the default variable implementation if the variable is on a diff --git a/tensorflow/python/distribute/parallel_device/parallel_device_test.py b/tensorflow/python/distribute/parallel_device/parallel_device_test.py index d3f3417eca9..e35eb601cc5 100644 --- a/tensorflow/python/distribute/parallel_device/parallel_device_test.py +++ b/tensorflow/python/distribute/parallel_device/parallel_device_test.py @@ -119,6 +119,12 @@ class ParallelDeviceTests(_VirtualDeviceTestCase): self.assertIn(self.device.components[0], outputs[0].backing_device) self.assertIn(self.device.components[1], outputs[1].backing_device) + def test_device_id(self): + device_ids = self.device.unpack(self.device.device_ids) + self.assertAllClose([0, 1], device_ids) + self.assertIn(self.device.components[0], device_ids[0].backing_device) + self.assertIn(self.device.components[1], device_ids[1].backing_device) + def test_collective_reduce(self): with ops.device(self.device.name): x = self.device.pack( From 3116ec3708443de4360c631f62a23b26eccd6763 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 09:35:57 -0700 Subject: [PATCH 035/412] Resolve trivial aliases for portable TensorFlow targets. PiperOrigin-RevId: 311142154 Change-Id: I702bddcc4b6dfb69d9a8747770fc88826603b1aa --- tensorflow/lite/delegates/flex/BUILD | 14 +++++++------- tensorflow/lite/testing/BUILD | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index 9fe80605e39..d69d2207e63 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -26,7 +26,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/c:c_api_internal", @@ -66,7 +66,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:tensorflow", @@ -103,7 +103,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:lib", @@ -137,7 +137,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/common_runtime/eager:context", @@ -183,7 +183,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/common_runtime/eager:context", @@ -211,7 +211,7 @@ tf_cc_test( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:tensorflow", @@ -245,7 +245,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib_lite", + 
"//tensorflow/core:ios_tensorflow_lib_lite", +
"//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/c:c_api_internal", diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD index 9d50f1ad604..df85f659bf3 100644 --- a/tensorflow/lite/testing/BUILD +++ b/tensorflow/lite/testing/BUILD @@ -329,7 +329,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) @@ -368,7 +368,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) @@ -408,7 +408,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) @@ -443,7 +443,7 @@ cc_library( "//tensorflow/core:android_tensorflow_lib", ], "//tensorflow:ios": [ - "//tensorflow/core:ios_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) From f4628678066c72309d3fd121af1aaf54d9905ca3 Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Tue, 12 May 2020 09:46:28 -0700 Subject: [PATCH 036/412] [XLA:Python] Make sure xla_client is always imported before TPU client extension. PiperOrigin-RevId: 311144400 Change-Id: Ia499185c36b5596b7aa25c44e51fd07696f85cfe --- tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py index ef0caff0ae6..6d4482af43f 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py @@ -20,6 +20,9 @@ from __future__ import print_function from absl import logging +# Import xla_client to load shared C++ extensions (just CompileOptions at the +# time of writing). +from tensorflow.compiler.xla.python import xla_client # pylint: disable=unused-import from tensorflow.compiler.xla.python.tpu_driver.client import tpu_client_extension as _tpu_client From d2bc2b66a3a0e373d3a6ecf56d45955ae9375591 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 09:46:51 -0700 Subject: [PATCH 037/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311144477 Change-Id: Iaa231e5c7e87d6e930b37003675adb307dad79b4 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 2ffde8a33949bdf3209d58729f7c56045a621deb Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 12 May 2020 10:07:20 -0700 Subject: [PATCH 038/412] This breaks multi-python: The local gen_build_info rule calls into find_cuda_config, which only works in the remote image. This is additionally brittle: relying on TF_CUDA_VERSION being an action_env is poisoning our caches, and running find_cuda_conifg multiple times is bugprone. I think the better way to do this is to put the information from the repo_rule into a file template as part of the repo rule configuration (cuda_configure.bzl). Then we can just include that file, instead of trying to do that as part of the action. PiperOrigin-RevId: 311148754 Change-Id: I80daa8652a85b2a1897c15117e6422bfd21cee6a --- tensorflow/python/BUILD | 19 ---- .../python/keras/layers/recurrent_v2.py | 4 +- tensorflow/python/platform/build_info_test.py | 6 +- tensorflow/python/platform/self_check.py | 5 +- tensorflow/python/platform/sysconfig.py | 28 ----- tensorflow/python/platform/sysconfig_test.py | 38 ------- tensorflow/tensorflow.bzl | 25 ++--- .../api/golden/v1/tensorflow.sysconfig.pbtxt | 4 - .../api/golden/v2/tensorflow.sysconfig.pbtxt | 4 - tensorflow/tools/build_info/BUILD | 1 - tensorflow/tools/build_info/gen_build_info.py | 101 +++++++++--------- tensorflow/tools/pip_package/setup.py | 54 +++------- third_party/gpus/BUILD | 6 -- 13 files changed, 85 insertions(+), 210 deletions(-) delete mode 100644 tensorflow/python/platform/sysconfig_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index bf17c828d66..0b046ea8d61 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -264,7 +264,6 @@ py_library( deps = [ ":_pywrap_util_port", ":lib", - ":platform_build_info", ":pywrap_tfe", ":util", "//tensorflow/core:protos_all_py", @@ -329,24 +328,6 @@ tf_py_test( ], ) -tf_py_test( - name = "sysconfig_test", - size = "small", - srcs = ["platform/sysconfig_test.py"], - data = [ - "platform/sysconfig.py", - ], - python_version = "PY3", - tags = [ - "no_pip", - "no_windows", - ], - deps = [ - ":platform", - ":platform_test", - ], -) - tf_py_test( name = "flags_test", size = "small", diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py index 9605c296885..a9d5ef8587c 100644 --- a/tensorflow/python/keras/layers/recurrent_v2.py +++ b/tensorflow/python/keras/layers/recurrent_v2.py @@ -601,7 +601,7 @@ def gpu_gru(inputs, init_h, kernel, recurrent_kernel, bias, mask, time_major, # (6 * units) bias = array_ops.split(K.flatten(bias), 6) - if build_info.build_info['is_cuda_build']: + if build_info.is_cuda_build: # Note that the gate order for CuDNN is different from the canonical format. # canonical format is [z, r, h], whereas CuDNN is [r, z, h]. The swap need # to be done for kernel, recurrent_kernel, input_bias, recurrent_bias. @@ -1361,7 +1361,7 @@ def gpu_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias, mask, # so that mathematically it is same as the canonical LSTM implementation. 
full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0) - if build_info.build_info['is_rocm_build']: + if build_info.is_rocm_build: # ROCm MIOpen's weight sequence for LSTM is different from both canonical # and Cudnn format # MIOpen: [i, f, o, c] Cudnn/Canonical: [i, f, c, o] diff --git a/tensorflow/python/platform/build_info_test.py b/tensorflow/python/platform/build_info_test.py index be253885715..f0df0b756cc 100644 --- a/tensorflow/python/platform/build_info_test.py +++ b/tensorflow/python/platform/build_info_test.py @@ -25,10 +25,8 @@ from tensorflow.python.platform import test class BuildInfoTest(test.TestCase): def testBuildInfo(self): - self.assertEqual(build_info.build_info['is_rocm_build'], - test.is_built_with_rocm()) - self.assertEqual(build_info.build_info['is_cuda_build'], - test.is_built_with_cuda()) + self.assertEqual(build_info.is_rocm_build, test.is_built_with_rocm()) + self.assertEqual(build_info.is_cuda_build, test.is_built_with_cuda()) if __name__ == '__main__': diff --git a/tensorflow/python/platform/self_check.py b/tensorflow/python/platform/self_check.py index c10c4108c7d..f6cf7705e13 100644 --- a/tensorflow/python/platform/self_check.py +++ b/tensorflow/python/platform/self_check.py @@ -20,7 +20,6 @@ from __future__ import print_function import os -MSVCP_DLL_NAMES = "msvcp_dll_names" try: from tensorflow.python.platform import build_info @@ -43,9 +42,9 @@ def preload_check(): # we load the Python extension, so that we can raise an actionable error # message if they are not found. import ctypes # pylint: disable=g-import-not-at-top - if MSVCP_DLL_NAMES in build_info.build_info: + if hasattr(build_info, "msvcp_dll_names"): missing = [] - for dll_name in build_info.build_info[MSVCP_DLL_NAMES].split(","): + for dll_name in build_info.msvcp_dll_names.split(","): try: ctypes.WinDLL(dll_name) except OSError: diff --git a/tensorflow/python/platform/sysconfig.py b/tensorflow/python/platform/sysconfig.py index a155ef04a4c..721ad99c60a 100644 --- a/tensorflow/python/platform/sysconfig.py +++ b/tensorflow/python/platform/sysconfig.py @@ -24,7 +24,6 @@ import platform as _platform from tensorflow.python.framework.versions import CXX11_ABI_FLAG as _CXX11_ABI_FLAG from tensorflow.python.framework.versions import MONOLITHIC_BUILD as _MONOLITHIC_BUILD from tensorflow.python.framework.versions import VERSION as _VERSION -from tensorflow.python.platform import build_info from tensorflow.python.util.tf_export import tf_export @@ -85,30 +84,3 @@ def get_link_flags(): else: flags.append('-l:libtensorflow_framework.so.%s' % ver) return flags - - -@tf_export('sysconfig.get_build_info') -def get_build_info(): - """Get a dictionary describing TensorFlow's build environment. - - Values are generated when TensorFlow is compiled, and are static for each - TensorFlow package. The return value is a dictionary with string keys such as: - - - cuda_version - - cudnn_version - - tensorrt_version - - nccl_version - - is_cuda_build - - is_rocm_build - - msvcp_dll_names - - nvcuda_dll_name - - cudart_dll_name - - cudnn_dll_name - - Note that the actual keys and values returned by this function is subject to - change across different versions of TensorFlow or across platforms. - - Returns: - A Dictionary describing TensorFlow's build environment. 
- """ - return build_info.build_info diff --git a/tensorflow/python/platform/sysconfig_test.py b/tensorflow/python/platform/sysconfig_test.py deleted file mode 100644 index 3e5956bf4f7..00000000000 --- a/tensorflow/python/platform/sysconfig_test.py +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.platform import googletest -from tensorflow.python.platform import sysconfig -from tensorflow.python.platform import test - - -class SysconfigTest(googletest.TestCase): - - def test_get_build_info_works(self): - build_info = sysconfig.get_build_info() - self.assertIsInstance(build_info, dict) - - def test_rocm_cuda_info_matches(self): - build_info = sysconfig.get_build_info() - self.assertEqual(build_info["is_rocm_build"], test.is_built_with_rocm()) - self.assertEqual(build_info["is_cuda_build"], test.is_built_with_cuda()) - - -if __name__ == "__main__": - googletest.main() diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 7d35ee7d8f8..f56330b428a 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2593,10 +2593,6 @@ def tf_version_info_genrule(name, out): arguments = "--generate \"$@\" --git_tag_override=${GIT_TAG_OVERRIDE:-}", ) -def dict_to_kv(d): - """Convert a dictionary to a space-joined list of key=value pairs.""" - return " " + " ".join(["%s=%s" % (k, v) for k, v in d.items()]) - def tf_py_build_info_genrule(name, out): _local_genrule( name = name, @@ -2604,17 +2600,16 @@ def tf_py_build_info_genrule(name, out): exec_tool = "//tensorflow/tools/build_info:gen_build_info", arguments = "--raw_generate \"$@\" " + - " --key_value" + - " is_rocm_build=" + if_rocm("True", "False") + - " is_cuda_build=" + if_cuda("True", "False") + - # TODO(angerson) Can we reliably load CUDA compute capabilities here? 
- if_windows(dict_to_kv({ - "msvcp_dll_names": "msvcp140.dll,msvcp140_1.dll", - }), "") + if_windows_cuda(dict_to_kv({ - "nvcuda_dll_name": "nvcuda.dll", - "cudart_dll_name": "cudart64_$$(echo $${TF_CUDA_VERSION:-} | sed \"s/\\.//\").dll", - "cudnn_dll_name": "cudnn64_$${TF_CUDNN_VERSION:-}.dll", - }), ""), + " --is_config_cuda " + if_cuda("True", "False") + + " --is_config_rocm " + if_rocm("True", "False") + + " --key_value " + + if_cuda(" cuda_version_number=${TF_CUDA_VERSION:-} cudnn_version_number=${TF_CUDNN_VERSION:-} ", "") + + if_windows(" msvcp_dll_names=msvcp140.dll,msvcp140_1.dll ", "") + + if_windows_cuda(" ".join([ + "nvcuda_dll_name=nvcuda.dll", + "cudart_dll_name=cudart64_$(echo $${TF_CUDA_VERSION:-} | sed \"s/\\.//\").dll", + "cudnn_dll_name=cudnn64_${TF_CUDNN_VERSION:-}.dll", + ]), ""), ) def cc_library_with_android_deps( diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt index 7b05d382f6c..811ca18cdb4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sysconfig.pbtxt @@ -8,10 +8,6 @@ tf_module { name: "MONOLITHIC_BUILD" mtype: "" } - member_method { - name: "get_build_info" - argspec: "args=[], varargs=None, keywords=None, defaults=None" - } member_method { name: "get_compile_flags" argspec: "args=[], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt index 7b05d382f6c..811ca18cdb4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sysconfig.pbtxt @@ -8,10 +8,6 @@ tf_module { name: "MONOLITHIC_BUILD" mtype: "" } - member_method { - name: "get_build_info" - argspec: "args=[], varargs=None, keywords=None, defaults=None" - } member_method { name: "get_compile_flags" argspec: "args=[], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/build_info/BUILD b/tensorflow/tools/build_info/BUILD index f1292408448..556dd0c86f0 100644 --- a/tensorflow/tools/build_info/BUILD +++ b/tensorflow/tools/build_info/BUILD @@ -14,7 +14,6 @@ py_binary( srcs_version = "PY2AND3", tags = ["no-remote-exec"], deps = [ - "//third_party/gpus:find_cuda_config", "@six_archive//:six", ], ) diff --git a/tensorflow/tools/build_info/gen_build_info.py b/tensorflow/tools/build_info/gen_build_info.py index c8330d9310f..df9068fb3d1 100755 --- a/tensorflow/tools/build_info/gen_build_info.py +++ b/tensorflow/tools/build_info/gen_build_info.py @@ -1,4 +1,4 @@ -# Lint as: python3 +# Lint as: python2, python3 # Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,62 +19,50 @@ from __future__ import division from __future__ import print_function import argparse -import os -import platform -import sys import six -# CUDA library gathering is only valid in OSS -try: - from third_party.gpus import find_cuda_config # pylint: disable=g-import-not-at-top -except ImportError: - find_cuda_config = None - -def write_build_info(filename, key_value_list): +def write_build_info(filename, is_config_cuda, is_config_rocm, key_value_list): """Writes a Python that describes the build. Args: filename: filename to write to. + is_config_cuda: Whether this build is using CUDA. + is_config_rocm: Whether this build is using ROCm. 
key_value_list: A list of "key=value" strings that will be added to the - module's "build_info" dictionary as additional entries. + module as additional fields. + + Raises: + ValueError: If `key_value_list` includes the key "is_cuda_build", which + would clash with one of the default fields. """ + module_docstring = "\"\"\"Generates a Python module containing information " + module_docstring += "about the build.\"\"\"" - build_info = {} - for arg in key_value_list: - key, value = six.ensure_str(arg).split("=") - if value.lower() == "true": - build_info[key] = True - elif value.lower() == "false": - build_info[key] = False - else: - build_info[key] = value + build_config_rocm_bool = "False" + build_config_cuda_bool = "False" - # Generate cuda_build_info, a dict describing the CUDA component versions - # used to build TensorFlow. - if find_cuda_config and build_info.get("is_cuda_build", False): - libs = ["_", "cuda", "cudnn"] - if platform.system() == "Linux": - if os.environ.get("TF_NEED_TENSORRT", "0") == "1": - libs.append("tensorrt") - if "TF_NCCL_VERSION" in os.environ: - libs.append("nccl") - # find_cuda_config accepts libraries to inspect as argv from the command - # line. We can work around this restriction by setting argv manually - # before calling find_cuda_config. - backup_argv = sys.argv - sys.argv = libs - cuda = find_cuda_config.find_cuda_config() + if is_config_rocm == "True": + build_config_rocm_bool = "True" + elif is_config_cuda == "True": + build_config_cuda_bool = "True" - build_info["cuda_version"] = cuda["cuda_version"] - build_info["cudnn_version"] = cuda["cudnn_version"] - build_info["tensorrt_version"] = cuda.get("tensorrt_version", None) - build_info["nccl_version"] = cuda.get("nccl_version", None) - sys.argv = backup_argv + key_value_pair_stmts = [] + if key_value_list: + for arg in key_value_list: + key, value = six.ensure_str(arg).split("=") + if key == "is_cuda_build": + raise ValueError("The key \"is_cuda_build\" cannot be passed as one of " + "the --key_value arguments.") + if key == "is_rocm_build": + raise ValueError("The key \"is_rocm_build\" cannot be passed as one of " + "the --key_value arguments.") + key_value_pair_stmts.append("%s = %r" % (key, value)) + key_value_pair_content = "\n".join(key_value_pair_stmts) contents = """ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -88,21 +76,33 @@ def write_build_info(filename, key_value_list): # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -\"\"\"Auto-generated module providing information about the build.\"\"\" +%s from __future__ import absolute_import from __future__ import division from __future__ import print_function -from collections import namedtuple +is_rocm_build = %s +is_cuda_build = %s -build_info = {build_info} -""".format(build_info=build_info) +%s +""" % (module_docstring, build_config_rocm_bool, build_config_cuda_bool, + key_value_pair_content) open(filename, "w").write(contents) parser = argparse.ArgumentParser( description="""Build info injection into the PIP package.""") +parser.add_argument( + "--is_config_cuda", + type=str, + help="'True' for CUDA GPU builds, 'False' otherwise.") + +parser.add_argument( + "--is_config_rocm", + type=str, + help="'True' for ROCm GPU builds, 'False' otherwise.") + parser.add_argument("--raw_generate", type=str, help="Generate build_info.py") parser.add_argument( @@ -110,7 +110,10 @@ parser.add_argument( args = parser.parse_args() -if args.raw_generate: - write_build_info(args.raw_generate, args.key_value) +if (args.raw_generate is not None) and (args.is_config_cuda is not None) and ( + args.is_config_rocm is not None): + write_build_info(args.raw_generate, args.is_config_cuda, args.is_config_rocm, + args.key_value) else: - raise RuntimeError("--raw_generate must be used.") + raise RuntimeError( + "--raw_generate, --is_config_cuda and --is_config_rocm must be used") diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index fafe494bed4..f61e00c01d5 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -1,4 +1,3 @@ -# lint as: python3 # Copyright 2015 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -44,8 +43,6 @@ from setuptools import setup from setuptools.command.install import install as InstallCommandBase from setuptools.dist import Distribution -from tensorflow.python.platform import build_info - DOCLINES = __doc__.split('\n') # This version string is semver compatible, but incompatible with pip. @@ -85,22 +82,6 @@ REQUIRED_PACKAGES = [ 'scipy == 1.2.2;python_version<"3"', ] -# Generate a footer describing the CUDA technology this release was built -# against. -GPU_DESCRIPTION = '' -if build_info.build_info['is_cuda_build']: - gpu_header = ('\nTensorFlow {} for NVIDIA GPUs was built with these ' - 'platform and library versions:\n\n - ').format(_VERSION) - bi = build_info.build_info - trt_ver = bi['tensorrt_version'] - nccl_ver = bi['nccl_version'] - GPU_DESCRIPTION = gpu_header + '\n - '.join([ - 'NVIDIA CUDA ' + bi['cuda_version'], - 'NVIDIA cuDNN ' + bi['cudnn_version'], - 'NVIDIA NCCL ' + 'not enabled' if not nccl_ver else nccl_ver, - 'NVIDIA TensorRT ' + 'not enabled' if not trt_ver else trt_ver, - ]) - if sys.byteorder == 'little': # grpcio does not build correctly on big-endian machines due to lack of # BoringSSL support. @@ -136,8 +117,7 @@ CONSOLE_SCRIPTS = [ # even though the command is not removed, just moved to a different wheel. 
'tensorboard = tensorboard.main:run_main', 'tf_upgrade_v2 = tensorflow.tools.compatibility.tf_upgrade_v2_main:main', - 'estimator_ckpt_converter = ' - 'tensorflow_estimator.python.estimator.tools.checkpoint_converter:main', + 'estimator_ckpt_converter = tensorflow_estimator.python.estimator.tools.checkpoint_converter:main', ] # pylint: enable=line-too-long @@ -181,10 +161,11 @@ class InstallHeaders(Command): """ description = 'install C/C++ header files' - user_options = [ - ('install-dir=', 'd', 'directory to install header files to'), - ('force', 'f', 'force installation (overwrite existing files)'), - ] + user_options = [('install-dir=', 'd', + 'directory to install header files to'), + ('force', 'f', + 'force installation (overwrite existing files)'), + ] boolean_options = ['force'] @@ -194,7 +175,8 @@ class InstallHeaders(Command): self.outfiles = [] def finalize_options(self): - self.set_undefined_options('install', ('install_headers', 'install_dir'), + self.set_undefined_options('install', + ('install_headers', 'install_dir'), ('force', 'force')) def mkdir_and_copy_file(self, header): @@ -254,7 +236,9 @@ so_lib_paths = [ matches = [] for path in so_lib_paths: - matches.extend(['../' + x for x in find_files('*', path) if '.py' not in x]) + matches.extend( + ['../' + x for x in find_files('*', path) if '.py' not in x] + ) if os.name == 'nt': EXTENSION_NAME = 'python/_pywrap_tensorflow_internal.pyd' @@ -274,16 +258,17 @@ headers = ( list(find_files('*.h', 'tensorflow/stream_executor')) + list(find_files('*.h', 'google/com_google_protobuf/src')) + list(find_files('*.inc', 'google/com_google_protobuf/src')) + - list(find_files('*', 'third_party/eigen3')) + - list(find_files('*.h', 'tensorflow/include/external/com_google_absl')) + - list(find_files('*.inc', 'tensorflow/include/external/com_google_absl')) + - list(find_files('*', 'tensorflow/include/external/eigen_archive'))) + list(find_files('*', 'third_party/eigen3')) + list( + find_files('*.h', 'tensorflow/include/external/com_google_absl')) + + list( + find_files('*.inc', 'tensorflow/include/external/com_google_absl')) + + list(find_files('*', 'tensorflow/include/external/eigen_archive'))) setup( name=project_name, version=_VERSION.replace('-', ''), description=DOCLINES[0], - long_description='\n'.join(DOCLINES[2:]) + GPU_DESCRIPTION, + long_description='\n'.join(DOCLINES[2:]), url='https://www.tensorflow.org/', download_url='https://github.com/tensorflow/tensorflow/tags', author='Google Inc.', @@ -304,11 +289,6 @@ setup( ] + matches, }, zip_safe=False, - # Accessible with importlib.metadata.metadata('tf-pkg-name').items() - platforms=[ - '{}:{}'.format(key, value) - for key, value in build_info.build_info.items() - ], distclass=BinaryDistribution, cmdclass={ 'install_headers': InstallHeaders, diff --git a/third_party/gpus/BUILD b/third_party/gpus/BUILD index d570c4894ce..e69de29bb2d 100644 --- a/third_party/gpus/BUILD +++ b/third_party/gpus/BUILD @@ -1,6 +0,0 @@ -# Expose find_cuda_config.py as a library so other tools can reference it. 
-py_library( - name = "find_cuda_config", - srcs = ["find_cuda_config.py"], - visibility = ["//visibility:public"], -) From b661070db9d29a2679310fe063b21582eeed9769 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Tue, 12 May 2020 10:11:10 -0700 Subject: [PATCH 039/412] IWYU in profiler/internal PiperOrigin-RevId: 311149561 Change-Id: I71100194af937fc66d44c32265b8fe8febf070df --- tensorflow/core/profiler/internal/BUILD | 16 ++++++---------- .../core/profiler/internal/annotation_stack.cc | 4 ++++ .../core/profiler/internal/annotation_stack.h | 1 + tensorflow/core/profiler/internal/cpu/BUILD | 16 ++++++++++------ .../core/profiler/internal/cpu/host_tracer.cc | 9 +++++++-- .../profiler/internal/cpu/host_tracer_test.cc | 18 ++++++++++++------ .../profiler/internal/cpu/host_tracer_utils.cc | 3 +++ .../internal/cpu/metadata_collector.cc | 6 ++++++ .../profiler/internal/cpu/python_tracer.cc | 9 ++++----- .../core/profiler/internal/parse_annotation.cc | 3 +++ .../core/profiler/internal/parse_annotation.h | 1 - .../profiler/internal/parse_annotation_test.cc | 3 +++ .../core/profiler/internal/profiler_factory.cc | 6 ++++++ .../core/profiler/internal/profiler_factory.h | 1 + .../profiler/internal/profiler_interface.h | 1 - .../internal/scoped_annotation_test.cc | 13 ++++++++----- .../core/profiler/internal/traceme_recorder.cc | 10 ++++++++++ .../core/profiler/internal/traceme_recorder.h | 2 -- .../profiler/internal/traceme_recorder_test.cc | 15 ++++++++++++--- 19 files changed, 96 insertions(+), 41 deletions(-) diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD index 9fab42cd54a..85fa4e7fc44 100644 --- a/tensorflow/core/profiler/internal/BUILD +++ b/tensorflow/core/profiler/internal/BUILD @@ -423,8 +423,10 @@ tf_cc_test( deps = [ ":traceme_recorder", "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -434,7 +436,6 @@ cc_library( deps = [ "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", ], ) @@ -444,6 +445,7 @@ cc_library( hdrs = ["profiler_factory.h"], deps = [ ":profiler_interface", + "//tensorflow/core/profiler:profiler_options_proto_cc", ] + if_static([ ":profiler_factory_impl", ]), @@ -461,8 +463,7 @@ cc_library( deps = [ ":profiler_interface", "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", ], alwayslink = True, ) @@ -513,15 +514,10 @@ tf_cc_test( srcs = ["scoped_annotation_test.cc"], deps = [ ":annotation_stack", - "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:lib", "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", "//tensorflow/core/profiler/lib:scoped_annotation", "@com_google_absl//absl/strings", ], @@ -544,6 +540,6 @@ tf_cc_test( ":parse_annotation", "//tensorflow/core:test", "//tensorflow/core:test_main", - "//tensorflow/core:testlib", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/profiler/internal/annotation_stack.cc b/tensorflow/core/profiler/internal/annotation_stack.cc index 4cfd1027a68..4c15ca47c3d 100644 --- 
a/tensorflow/core/profiler/internal/annotation_stack.cc +++ b/tensorflow/core/profiler/internal/annotation_stack.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/core/profiler/internal/annotation_stack.h" +#include + +#include "tensorflow/core/platform/types.h" + namespace tensorflow { namespace profiler { namespace internal { diff --git a/tensorflow/core/profiler/internal/annotation_stack.h b/tensorflow/core/profiler/internal/annotation_stack.h index 38cd962cb32..e626c4c73cc 100644 --- a/tensorflow/core/profiler/internal/annotation_stack.h +++ b/tensorflow/core/profiler/internal/annotation_stack.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD index e156667c5a7..c24c8c7d456 100644 --- a/tensorflow/core/profiler/internal/cpu/BUILD +++ b/tensorflow/core/profiler/internal/cpu/BUILD @@ -18,6 +18,7 @@ cc_library( "//tensorflow/core/profiler/utils:tf_op_utils", "//tensorflow/core/profiler/utils:xplane_builder", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/strings", ], ) @@ -26,10 +27,10 @@ cc_library( srcs = ["host_tracer.cc"], deps = [ ":host_tracer_utils", - "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/internal:traceme_recorder", @@ -50,14 +51,17 @@ tf_cc_test( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/lib:profiler_session", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_visitor", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) @@ -67,17 +71,14 @@ cc_library( copts = ["-fexceptions"], features = ["-use_header_modules"], deps = [ - "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", - "//tensorflow/core/profiler/utils:xplane_schema", - "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/python/profiler/internal:python_hooks", - "@com_google_absl//absl/strings", ], alwayslink = True, ) @@ -86,9 +87,12 @@ cc_library( name = "metadata_collector", srcs = ["metadata_collector.cc"], deps = [ + "//tensorflow/compiler/xla/service:hlo_proto_cc", "//tensorflow/compiler/xla/service/gpu:gpu_debug_info_manager", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler:profiler_options_proto_cc", "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", 
"//tensorflow/core/profiler/protobuf:xplane_proto_cc", diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index 30b87c84fa2..be1a7a2777b 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -12,18 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include #include #include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/internal/traceme_recorder.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" @@ -119,8 +124,8 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) { std::vector parts = absl::StrSplit(event.name, kUserMetadataMarker); if (parts.size() >= 2) { - ns->set_node_name(string(parts[0])); - ns->set_timeline_label(string(parts[1])); + ns->set_node_name(std::string(parts[0])); + ns->set_timeline_label(std::string(parts[1])); } else { ns->set_node_name(std::move(event.name)); } diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc index e32ba92de66..499b7b6b564 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc @@ -12,17 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include #include -#include +#include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/core/framework/step_stats.pb.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/lib/profiler_session.h" #include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_visitor.h" @@ -38,13 +44,13 @@ namespace { using ::testing::UnorderedElementsAre; -NodeExecStats MakeNodeStats(const string& name, uint32 thread_id, - const string& label = "") { +NodeExecStats MakeNodeStats(absl::string_view name, uint32 thread_id, + absl::string_view label = "") { NodeExecStats ns; - ns.set_node_name(name); + ns.set_node_name(std::string(name)); ns.set_thread_id(thread_id); if (!label.empty()) { - ns.set_timeline_label(label); + ns.set_timeline_label(std::string(label)); } return ns; } @@ -109,7 +115,7 @@ TEST(HostTracerTest, CollectsTraceMeEventsAsRunMetadata) { TEST(HostTracerTest, CollectsTraceMeEventsAsXSpace) { uint32 thread_id; - string thread_name = "MyThreadName"; + std::string thread_name = "MyThreadName"; XSpace space; // We start a thread with a known and controled name. As of the time of diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc index a4709ae2113..2e5d8ac1770 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc @@ -14,10 +14,13 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h" +#include #include #include #include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/parse_annotation.h" #include "tensorflow/core/profiler/internal/traceme_recorder.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" diff --git a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc index c6aa7840920..58da20ae3c5 100644 --- a/tensorflow/core/profiler/internal/cpu/metadata_collector.cc +++ b/tensorflow/core/profiler/internal/cpu/metadata_collector.cc @@ -13,17 +13,23 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include #include #include #include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/profiler/utils/xplane_builder.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/cpu/python_tracer.cc b/tensorflow/core/profiler/internal/cpu/python_tracer.cc index 103db6e0c71..d684cb8f768 100644 --- a/tensorflow/core/profiler/internal/cpu/python_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/python_tracer.cc @@ -12,18 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include -#include +#include -#include "absl/strings/str_split.h" -#include "tensorflow/core/framework/step_stats.pb.h" -#include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/util/env_var.h" #include "tensorflow/python/profiler/internal/python_hooks.h" diff --git a/tensorflow/core/profiler/internal/parse_annotation.cc b/tensorflow/core/profiler/internal/parse_annotation.cc index 2a3fa3f8454..32c26befa3d 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.cc +++ b/tensorflow/core/profiler/internal/parse_annotation.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/core/profiler/internal/parse_annotation.h" #include +#include +#include +#include #include "absl/strings/ascii.h" #include "absl/strings/str_split.h" diff --git a/tensorflow/core/profiler/internal/parse_annotation.h b/tensorflow/core/profiler/internal/parse_annotation.h index 6c2e536962b..bb0f12217d3 100644 --- a/tensorflow/core/profiler/internal/parse_annotation.h +++ b/tensorflow/core/profiler/internal/parse_annotation.h @@ -16,7 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PARSE_ANNOTATION_H_ #define TENSORFLOW_CORE_PROFILER_INTERNAL_PARSE_ANNOTATION_H_ -#include #include #include "absl/strings/string_view.h" diff --git a/tensorflow/core/profiler/internal/parse_annotation_test.cc b/tensorflow/core/profiler/internal/parse_annotation_test.cc index 4d4a2d5ea95..e5d876ac5af 100644 --- a/tensorflow/core/profiler/internal/parse_annotation_test.cc +++ b/tensorflow/core/profiler/internal/parse_annotation_test.cc @@ -14,6 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/profiler/internal/parse_annotation.h" +#include + +#include "absl/strings/string_view.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/internal/profiler_factory.cc b/tensorflow/core/profiler/internal/profiler_factory.cc index e2bae59b892..5152e79bdc8 100644 --- a/tensorflow/core/profiler/internal/profiler_factory.cc +++ b/tensorflow/core/profiler/internal/profiler_factory.cc @@ -14,8 +14,14 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/internal/profiler_factory.h" +#include +#include +#include + #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/profiler_factory.h b/tensorflow/core/profiler/internal/profiler_factory.h index 6bcdcf28c3c..c223d7275d9 100644 --- a/tensorflow/core/profiler/internal/profiler_factory.h +++ b/tensorflow/core/profiler/internal/profiler_factory.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h index 79dfc7af2b2..9fe85e38652 100644 --- a/tensorflow/core/profiler/internal/profiler_interface.h +++ b/tensorflow/core/profiler/internal/profiler_interface.h @@ -16,7 +16,6 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_ #include "tensorflow/core/platform/status.h" -#include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/protobuf/config.pb.h" diff --git a/tensorflow/core/profiler/internal/scoped_annotation_test.cc b/tensorflow/core/profiler/internal/scoped_annotation_test.cc index 70a627fd640..50c1244b9ee 100644 --- a/tensorflow/core/profiler/internal/scoped_annotation_test.cc +++ b/tensorflow/core/profiler/internal/scoped_annotation_test.cc @@ -15,10 +15,11 @@ limitations under the License. 
#include "tensorflow/core/profiler/lib/scoped_annotation.h" +#include + #include "absl/strings/str_cat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" -#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/annotation_stack.h" namespace tensorflow { @@ -48,11 +49,13 @@ TEST(ScopedAnnotation, Simple) { EXPECT_EQ(AnnotationStack::Get(), ""); // not enabled } -string GenerateRandomString(int length) { return string(length, 'a'); } +std::string GenerateRandomString(int length) { + return std::string(length, 'a'); +} void BM_ScopedAnnotationDisabled(int iters, int annotation_size) { testing::StopTiming(); - string annotation = GenerateRandomString(annotation_size); + std::string annotation = GenerateRandomString(annotation_size); testing::StartTiming(); for (int i = 0; i < iters; i++) { ScopedAnnotation trace(annotation); @@ -64,7 +67,7 @@ BENCHMARK(BM_ScopedAnnotationDisabled)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled(int iters, int annotation_size) { testing::StopTiming(); - string annotation = GenerateRandomString(annotation_size); + std::string annotation = GenerateRandomString(annotation_size); AnnotationStack::Enable(true); testing::StartTiming(); for (int i = 0; i < iters; i++) { @@ -78,7 +81,7 @@ BENCHMARK(BM_ScopedAnnotationEnabled)->Arg(8)->Arg(32)->Arg(128); void BM_ScopedAnnotationEnabled_Nested(int iters, int annotation_size) { testing::StopTiming(); - string annotation = GenerateRandomString(annotation_size); + std::string annotation = GenerateRandomString(annotation_size); AnnotationStack::Enable(true); testing::StartTiming(); for (int i = 0; i < iters; i++) { diff --git a/tensorflow/core/profiler/internal/traceme_recorder.cc b/tensorflow/core/profiler/internal/traceme_recorder.cc index 365e3992bc3..268585bde8c 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder.cc +++ b/tensorflow/core/profiler/internal/traceme_recorder.cc @@ -16,8 +16,18 @@ limitations under the License. #include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace profiler { diff --git a/tensorflow/core/profiler/internal/traceme_recorder.h b/tensorflow/core/profiler/internal/traceme_recorder.h index 8b5b32cf4bc..1da7d4cebb1 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder.h +++ b/tensorflow/core/profiler/internal/traceme_recorder.h @@ -15,8 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_ #define TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_ -#include - #include #include diff --git a/tensorflow/core/profiler/internal/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/traceme_recorder_test.cc index 90478881361..8d7abc94e8f 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder_test.cc +++ b/tensorflow/core/profiler/internal/traceme_recorder_test.cc @@ -15,19 +15,28 @@ limitations under the License. 
#include "tensorflow/core/profiler/internal/traceme_recorder.h" #include +#include +#include +#include +#include +#include #include -#include #include "absl/strings/str_cat.h" -#include "tensorflow/core/lib/core/threadpool.h" +#include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/env_time.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/notification.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/threadpool.h" #include "tensorflow/core/platform/types.h" namespace tensorflow { namespace profiler { namespace { +using ::testing::ElementsAre; + MATCHER_P(Named, name, "") { return arg.name == name; } constexpr static uint64 kNanosInSec = 1000000000; @@ -45,7 +54,7 @@ TEST(RecorderTest, SingleThreaded) { ASSERT_EQ(results.size(), 1); EXPECT_THAT(results[0].events, - ::testing::ElementsAre(Named("during1"), Named("during2"))); + ElementsAre(Named("during1"), Named("during2"))); } void SpinNanos(int nanos) { From 2407170febcdc37fbe90d9f5d8968f2b94ec17dc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 10:48:33 -0700 Subject: [PATCH 040/412] Add json translation for tfjs mlir converter. TFJS ops are registered as TF custom ops, and utilize export_graphdef.cc to build out the GraphDef object that could contain both TF and TFJS dialects. PiperOrigin-RevId: 311158257 Change-Id: I7313a5a01f12ef742a97fd5e9ff2bbffe8498b0c --- tensorflow/compiler/mlir/runlit.cfg.py | 6 +- tensorflow/compiler/mlir/runlit.site.cfg.py | 1 + .../mlir/tensorflow/utils/export_utils.cc | 23 ++- .../mlir/tensorflow/utils/export_utils.h | 7 + tensorflow/compiler/mlir/tfjs/BUILD | 101 +++++++++- tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h | 1 + tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD | 23 +++ .../compiler/mlir/tfjs/tests/e2e/add.pbtxt | 78 ++++++++ .../compiler/mlir/tfjs/tests/e2e/prelu.pbtxt | 175 ++++++++++++++++++ .../compiler/mlir/tfjs/tf_tfjs_passes.cc | 8 +- .../mlir/tfjs/translate/json_translate.cc | 105 +++++++++++ .../mlir/tfjs/translate/json_translate.h | 31 ++++ .../mlir/tfjs/translate/tf_tfjs_translate.cc | 173 +++++++++++++++++ .../mlir/tfjs/translate/tf_to_tfjs_json.cc | 152 +++++++++++++++ .../mlir/tfjs/translate/tf_to_tfjs_json.h | 63 +++++++ 15 files changed, 938 insertions(+), 9 deletions(-) create mode 100644 tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD create mode 100644 tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt create mode 100644 tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt create mode 100644 tensorflow/compiler/mlir/tfjs/translate/json_translate.cc create mode 100644 tensorflow/compiler/mlir/tfjs/translate/json_translate.h create mode 100644 tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc create mode 100644 tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc create mode 100644 tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index 6d3131a781c..f1271d0da24 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -70,9 +70,9 @@ tool_dirs = config.mlir_tf_tools_dirs + [ ] tool_names = [ 'mlir-opt', 'mlir-translate', 'tf-opt', 'tf_tfl_translate', - 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', - 'mlir-tflite-runner', 'tfcompile', 'json_to_flatbuffer', 'xla-gpu-opt', - 'xla-opt' + 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', + 'tf-mlir-translate', 
'mlir-tflite-runner', 'tfcompile', + 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index 661e6200df3..3e7596c75d7 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -44,6 +44,7 @@ mlir_tf_tools_dirs = [ 'tensorflow/compiler/mlir', 'tensorflow/compiler/mlir/lite', 'tensorflow/compiler/mlir/tensorflow', + 'tensorflow/compiler/mlir/tfjs', 'tensorflow/compiler/mlir/xla', 'tensorflow/compiler/aot', 'tensorflow/compiler/xla/service/mlir_gpu', diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index cc795259893..4877cbc4a44 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -59,6 +59,18 @@ limitations under the License. namespace tensorflow { namespace { +// static TensorFlow op prefix set. +std::set* GlobalOpPrefixes() { + static std::set* global_op_prefixes = [] { + std::set* result = new std::set; + result->insert("tf."); + result->insert("_tf."); + result->insert("tf_executor."); + return result; + }(); + return global_op_prefixes; +} + // Converts a location to the debug information for the node def. Status ConvertLocation(mlir::Location inst_loc, NodeDef::ExperimentalDebugInfo* debug_info) { @@ -268,8 +280,10 @@ StatusOr GetTensorFlowOpName(llvm::StringRef op_name) { // - ".sink" or ".Sink": only the NextIteration operation has this suffix. We // don't need to consider ".source"/".Source" because the nodes with this // suffix are skipped by the caller and will not be added to the graph. - if (!op_name.consume_front("_tf.") && !op_name.consume_front("tf.") && - !op_name.consume_front("tf_executor.")) { + auto prefixes = GlobalOpPrefixes(); + if (std::none_of(prefixes->begin(), prefixes->end(), [&](std::string prefix) { + return op_name.consume_front(prefix); + })) { return errors::FailedPrecondition("op node '", op_name.str(), "' was not a TF op!"); } @@ -506,4 +520,9 @@ bool IsLegacyCallInstruction(mlir::Operation* inst) { inst->getName().getStringRef().compare("_tf.LegacyCall") == 0; } +Status AddTensorFlowOpPrefix(std::string prefix) { + GlobalOpPrefixes()->insert(prefix); + return Status::OK(); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 32ed528bd0d..58fe39fa4e8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -34,10 +34,17 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/stream_executor/lib/statusor.h" +namespace mlir { +class ShapedType; +} // namespace mlir + namespace tensorflow { using stream_executor::port::StatusOr; +// Add custom op prefix for TensorFlow dialects. +Status AddTensorFlowOpPrefix(std::string); + // Maps an MLIR op name in the TensorFlow dialect or the TensorFlow control // dialect back into a TensorFlow valid op name. 
StatusOr GetTensorFlowOpName(llvm::StringRef); diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 9b731d2c912..806a77e9c38 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ -1,4 +1,5 @@ load("//third_party/mlir:tblgen.bzl", "gentbl") +load("//tensorflow:tensorflow.bzl", "tf_cc_binary") package( default_visibility = ["//visibility:public"], @@ -131,10 +132,106 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tf_graph_optimization_pass", - "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Transforms", ], ) + +cc_library( + name = "json_translate_lib", + srcs = [ + "translate/json_translate.cc", + ], + hdrs = [ + "translate/json_translate.h", + ], + deps = [ + ":tensorflow_js", + ":tensorflow_js_dialect_registration", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", + "//tensorflow/compiler/mlir/tensorflow:export_utils", + "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_dialect_registration", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/status", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Translation", + ], + alwayslink = 1, +) + +cc_library( + name = "tf_to_tfjs_json", + srcs = ["translate/tf_to_tfjs_json.cc"], + hdrs = [ + "translate/tf_to_tfjs_json.h", + ], + deps = [ + ":json_translate_lib", + ":tfjs_optimize", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:decode_constant_pass", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", + "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/compiler/mlir/tensorflow:translate_lib", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + "@llvm-project//llvm:support", + "@llvm-project//mlir:AllPassesAndDialects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + ], + alwayslink = 1, +) + +tf_cc_binary( + name = "json_translate", + deps = [ + ":json_translate_lib", + "@llvm-project//mlir:MlirTranslateMain", + ], +) + +filegroup( + name = "tf_tfjs_translate_main", + srcs = [ + "translate/tf_tfjs_translate.cc", + ], +) + +tf_cc_binary( + name = "tf_tfjs_translate", + srcs = [":tf_tfjs_translate_main"], + deps = [ + ":json_translate_lib", + ":tensorflow_js_passes", + ":tf_to_tfjs_json", + ":tfjs_optimize", + "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:errors", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + 
"@llvm-project//mlir:Support", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 318895de79c..545183a052b 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -28,6 +28,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/SideEffects.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project + namespace mlir { namespace tfjs { diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD new file mode 100644 index 00000000000..5c8d37da2f0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/BUILD @@ -0,0 +1,23 @@ +load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") + +licenses(["notice"]) + +glob_lit_tests( + data = [ + ":test_utilities", + ], + driver = "@llvm-project//mlir:run_lit.sh", + test_file_exts = [ + "pbtxt", + ], +) + +# Bundle together all of the test utilities that are used by tests. +filegroup( + name = "test_utilities", + testonly = True, + data = [ + "//tensorflow/compiler/mlir/tfjs:tf_tfjs_translate", + "@llvm-project//llvm:FileCheck", + ], +) diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt new file mode 100644 index 00000000000..f6a324fdc13 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/add.pbtxt @@ -0,0 +1,78 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0,input1 -tf-input-data-types=DT_INT32,DT_INT32 -tf-input-shapes=10:10 -tf-output-arrays=Mul -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs and return the result + +node { + name: "Add" + op: "Add" + input: "input0" + input: "input1" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "input1" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } +} +node { + name: "Mul" + op: "Mul" + input: "Add" + input: "Add" + attr { + key: "T" + value { + type: DT_INT32 + } + } +} +versions { + producer: 27 +} + +# CHECK: "name": "input0" +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "input1", +# CHECK-NEXT: "op": "Placeholder" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Add" +# CHECK-NEXT: "op": "AddV2" +# CHECK-NEXT: "input": +# CHECK-NEXT: "input0" +# CHECK-NEXT: "input1" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul1" +# CHECK-NEXT: "op": "Mul" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Add" +# CHECK-NEXT: "Add" +# CHECK: "type": "DT_INT32" +# CHECK: "name": "Mul" +# CHECK-NEXT: "op": "_Retval" +# CHECK-NEXT: "input": +# CHECK-NEXT: "Mul1" +# CHECK: "type": "DT_INT32" +# CHECK: "library" +# CHECK: "versions" +# CHECK: "producer": 27 + diff --git a/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt new file mode 100644 index 00000000000..810db71f5e0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/tests/e2e/prelu.pbtxt @@ -0,0 +1,175 @@ +# RUN: tf_tfjs_translate %s -tf-input-arrays=input0 -tf-input-data-types=DT_FLOAT -tf-input-shapes=10 -tf-output-arrays=Add -tf-custom-opdefs="name: 'Prelu' input_arg: { name: 'x' type: DT_FLOAT } input_arg: { name: 'alpha' type: DT_FLOAT } output_arg: { name: 'c' type: DT_FLOAT }" -o - | FileCheck %s --dump-input-on-failure +# Add two tensor<4xi32> inputs 
and return the result + +node { + name: "input0" + op: "Placeholder" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + dim { + size: 10 + } + } + } + } + experimental_debug_info { + } +} +node { + name: "alpha" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 0.5 + } + } + } + experimental_debug_info { + } +} +node { + name: "Relu" + op: "Relu" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Neg" + op: "Neg" + input: "input0" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Relu1" + op: "Relu" + input: "Neg" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Mul" + op: "Mul" + input: "alpha" + input: "Relu1" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "Add" + op: "Add" + input: "Relu" + input: "Mul" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + experimental_debug_info { + } +} +node { + name: "main" + op: "_Retval" + input: "Add" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "index" + value { + i: 0 + } + } +} +library { +} +versions { + producer: 344 +} + +# CHECK: "node": +# CHECK: "name": "input0", +# CHECK-NEXT: "op": "Placeholder", +# CHECK-NEXT: "attr": +# CHECK: "type": "DT_FLOAT" +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul", +# CHECK-NEXT: "op": "Const", +# CHECK-NEXT: "attr": +# CHECK: "value": +# CHECK: "tensor": +# CHECK: "dtype": "DT_FLOAT", +# CHECK: "tensorShape": {}, +# CHECK: "floatVal": +# CHECK: -0.5 +# CHECK: "name": "Add.Relu.Neg.Relu1.Mul1", +# CHECK-NEXT: "op": "Prelu", +# CHECK-NEXT: "input": +# CHECK: "input0", +# CHECK: "Add.Relu.Neg.Relu1.Mul" +# CHECK: "attr": +# CHECK: "_output_shapes": +# CHECK: "list": +# CHECK: "shape": +# CHECK: "dim": +# CHECK: "size": "10" +# CHECK: "experimentalDebugInfo": {} +# CHECK: "name": "Add", +# CHECK-NEXT: "op": "_Retval", +# CHECK-NEXT: "input": +# CHECK: "Add.Relu.Neg.Relu1.Mul1" +# CHECK: "attr": +# CHECK: "T": +# CHECK: "type": "DT_FLOAT" +# CHECK: "library": {}, +# CHECK: "versions": +# CHECK: "producer": 344 + diff --git a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc index 631bb1ae2af..a445937570e 100644 --- a/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc +++ b/tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -20,7 +20,6 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/transforms/decode_constant.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" @@ -47,6 +46,11 @@ void AddTFToTFJSConversionPasses(mlir::OpPassManager* pm) { // Canonicalize, CSE etc. 
pm->addNestedPass(mlir::createCanonicalizerPass()); pm->addNestedPass(mlir::createCSEPass()); + + // raise to executor dialect in order to use GraphDef converter + pm->addNestedPass( + mlir::CreateFunctionalToExecutorDialectConversionPass()); + pm->addNestedPass(mlir::CreateBreakUpIslandsPass()); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc new file mode 100644 index 00000000000..7f4b8ffae09 --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.cc @@ -0,0 +1,105 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" + +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/status/status.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Translation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function.pb.h" +#include "tensorflow/core/framework/graph.pb.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" + +using mlir::ModuleOp; +using mlir::TranslateFromMLIRRegistration; +using std::string; +using tensorflow::Status; +using xla::StatusOr; + +// Translates the given MLIR module in the TFJS dialect to TFJS JSON +// format. Returns false on success. +// +bool tfjs::MlirToJSONTranslateFunction(ModuleOp module, + std::string* serialized_json) { + string json_output; + // Allow TF to treat TFJS ops as TF ops. + if (!tensorflow::AddTensorFlowOpPrefix("tfjs.").ok()) { + LOG(ERROR) << "Failed to add tfjs op prefix."; + return false; + } + tensorflow::GraphExportConfig confs; + confs.export_shapes = true; + confs.export_library = true; + tensorflow::FunctionLibraryDefinition flib_def( + tensorflow::OpRegistry::Global(), tensorflow::FunctionDefLibrary()); + absl::flat_hash_set control_ret_nodes; + auto graph = absl::make_unique(flib_def); + auto status = tensorflow::ConvertMlirToGraph(module, confs, &graph, &flib_def, + &control_ret_nodes); + if (!status.ok()) { + LOG(ERROR) << "Graph export failed: " << status; + return false; + } + auto graphdef = absl::make_unique(); + graph->ToGraphDef(graphdef.get()); + + // Replace the _Arg nodes of the main function with Placeholder op. 
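+  // _Arg is TensorFlow's internal op for function arguments; rewriting it
+  // keeps model inputs visible as ordinary Placeholder nodes in the exported
+  // JSON, which is what the e2e tests above check for.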
+ auto nodes = graphdef->mutable_node(); + for (const auto& node : llvm::enumerate(*nodes)) { + if (node.value().op() == "_Arg") { + nodes->Mutable(node.index())->set_op("Placeholder"); + } + } + + tensorflow::protobuf::util::JsonPrintOptions json_options; + json_options.add_whitespace = true; + auto jsonStatus = tensorflow::protobuf::util::MessageToJsonString( + *graphdef, &json_output, json_options); + if (!jsonStatus.ok()) { + LOG(ERROR) << "Proto2Json failed: " << status; + return false; + } + *serialized_json = std::move(json_output); + return true; +} + +static mlir::LogicalResult MlirToJSONFileTranslateFunction( + ModuleOp module, llvm::raw_ostream& output) { + std::string serialized_json; + if (!tfjs::MlirToJSONTranslateFunction(module, &serialized_json)) + return mlir::failure(); + + output << serialized_json; + return mlir::success(); +} + +static TranslateFromMLIRRegistration MLIRToJSONFileTranslate( + "mlir-to-tfjs-json", MlirToJSONFileTranslateFunction); diff --git a/tensorflow/compiler/mlir/tfjs/translate/json_translate.h b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h new file mode 100644 index 00000000000..0a931f770ad --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/json_translate.h @@ -0,0 +1,31 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ + +#include + +#include "mlir/IR/Module.h" // from @llvm-project +#include "tensorflow/core/lib/core/status.h" + +namespace tfjs { + +// Translates the given MLIR `module` into a JSON string. Returns true if +// translation fails, otherwise returns false. +bool MlirToJSONTranslateFunction(mlir::ModuleOp module, + std::string* serialized_json); +} // namespace tfjs + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_JSON_TRANSLATE_H_ diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc new file mode 100644 index 00000000000..e735a3c7b8c --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_tfjs_translate.cc @@ -0,0 +1,173 @@ + +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include + +#include "absl/strings/str_split.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Function.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/init_mlir.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" +#include "tensorflow/compiler/mlir/tfjs/tf_tfjs_passes.h" +#include "tensorflow/compiler/mlir/tfjs/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +using llvm::cl::opt; +using mlir::MLIRContext; +using stream_executor::port::StatusOr; + +// NOLINTNEXTLINE +opt input_file_name(llvm::cl::Positional, + llvm::cl::desc(""), + llvm::cl::init("-")); + +// NOLINTNEXTLINE +opt import_saved_model_object_graph( + "savedmodel-objectgraph-to-mlir", + llvm::cl::desc("Import a saved model to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt import_saved_model_signature_defs( + "savedmodel-signaturedefs-to-mlir", + llvm::cl::desc("Import a saved model V1 to its MLIR representation"), + llvm::cl::value_desc("dir")); + +// NOLINTNEXTLINE +opt saved_model_tags( + "tf-savedmodel-tags", + llvm::cl::desc("Tags used to indicate which MetaGraphDef to import, " + "separated by ','"), + llvm::cl::init("serve")); + +// NOLINTNEXTLINE +opt saved_model_exported_names( + "tf-savedmodel-exported-names", + llvm::cl::desc("Names to export from SavedModel, separated by ','. Empty " + "(the default) means export all."), + llvm::cl::init("")); + +// NOLINTNEXTLINE +opt output_file_name("o", llvm::cl::desc(""), + llvm::cl::value_desc("filename"), + llvm::cl::init("-")); +// NOLINTNEXTLINE +opt input_mlir( + "input-mlir", + llvm::cl::desc("Take input TensorFlow model in textual MLIR instead of " + "GraphDef format"), + llvm::cl::init(false), llvm::cl::Hidden); +// NOLINTNEXTLINE +opt output_mlir( + "output-mlir", + llvm::cl::desc("Output MLIR rather than JSON for the generated TFJS model"), + llvm::cl::init(false)); + +// The following approach allows injecting opdefs in addition +// to those that are already part of the global TF registry to be linked in +// prior to importing the graph. The primary goal is for support of custom ops. +// This is not intended to be a general solution for custom ops for the future +// but mainly for supporting older models like mobilenet_ssd. More appropriate +// mechanisms, such as op hints or using functions to represent composable ops +// like https://github.com/tensorflow/community/pull/113 should be encouraged +// going forward. +// NOLINTNEXTLINE +llvm::cl::list custom_opdefs( + "tf-custom-opdefs", llvm::cl::desc("List of custom opdefs when importing " + "graphdef")); + +// Debugging flag to print function mapping in the JSON. 
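+// (When enabled, main() also echoes the generated JSON, or MLIR, output to
+// stdout after the output file has been written.)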
+// NOLINTNEXTLINE +static opt print_function_result_mapping( + "print-function-result-mapping", + llvm::cl::desc( + "Print the mapping of function result to json output buffer"), + llvm::cl::init(false)); + +enum TranslationStatus { kTrSuccess, kTrFailure }; + +static int PrintFunctionResultMapping(const std::string& result) { + std::cout << result << std::endl; + return kTrSuccess; +} + +int main(int argc, char** argv) { + tensorflow::InitMlir y(&argc, &argv); + + llvm::cl::ParseCommandLineOptions(argc, argv, + "TF GraphDef to TFJS JSON converter\n"); + + MLIRContext context; + llvm::SourceMgr source_mgr; + mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &context); + + StatusOr module; + + if (import_saved_model_object_graph || import_saved_model_signature_defs) { + if (input_mlir) + module = tensorflow::errors::InvalidArgument( + "Importing saved model should not have input_mlir set"); + module = tensorflow::ImportSavedModel( + import_saved_model_object_graph, import_saved_model_signature_defs, + custom_opdefs, input_file_name, saved_model_tags, + saved_model_exported_names, &context); + } else { + module = tensorflow::LoadFromGraphdefOrMlirSource( + input_file_name, input_mlir, custom_opdefs, debug_info_file, + input_arrays, input_dtypes, input_shapes, output_arrays, + /*prune_unused_nodes=*/true, &source_mgr, &context); + } + + // If errors occur, the library call in the above already logged the error + // message. So we can just return here. + if (!module.ok()) return kTrFailure; + + mlir::PassManager pm(&context); + + tensorflow::AddTFToTFJSConversionPasses(&pm); + + std::string result; + auto status = tensorflow::ConvertTFOpsToTfjsJSON(module.ValueOrDie().get(), + output_mlir, &result, &pm); + if (!status.ok()) return kTrFailure; + + std::string error_msg; + auto output = mlir::openOutputFile(output_file_name, &error_msg); + if (output == nullptr) { + llvm::errs() << error_msg << '\n'; + return kTrFailure; + } + output->os() << result; + output->keep(); + + // Print out debugging info related to function mapping. + if (print_function_result_mapping) return PrintFunctionResultMapping(result); + return kTrSuccess; +} diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc new file mode 100644 index 00000000000..7dc9ea049ba --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.cc @@ -0,0 +1,152 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h" + +#include +#include +#include +#include +#include + +#include "absl/strings/str_split.h" +#include "absl/strings/string_view.h" +#include "absl/types/span.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Parser.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tfjs/translate/json_translate.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +using mlir::MLIRContext; +using mlir::ModuleOp; +using mlir::OwningModuleRef; +using stream_executor::port::StatusOr; + +namespace { +tensorflow::Status RegisterCustomOps( + const std::vector& extra_tf_opdefs) { + for (const auto& tf_opdefs_string : extra_tf_opdefs) { + tensorflow::OpDef opdef; + if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, + &opdef)) { + LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; + return errors::InvalidArgument("fail to parse extra OpDef"); + } + // Register extra opdefs. + tensorflow::OpRegistry::Global()->Register( + [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { + *op_reg_data = tensorflow::OpRegistrationData(opdef); + return Status::OK(); + }); + } + return Status::OK(); +} +} // namespace + +StatusOr LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, MLIRContext* context) { + // Set up the input file. 
+ std::string error_message; + auto file = mlir::openInputFile(input_filename, &error_message); + if (!file) { + llvm::errs() << error_message << "\n"; + return errors::InvalidArgument("fail to open input file"); + } + + if (input_mlir) { + source_mgr->AddNewSourceBuffer(std::move(file), llvm::SMLoc()); + return OwningModuleRef(mlir::parseSourceFile(*source_mgr, context)); + } + + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + + return tensorflow::GraphdefToMlirTranslateFunction( + file->getBuffer(), debug_info_file, input_arrays, input_dtypes, + input_shapes, output_arrays, /*control_output_arrays=*/"", + prune_unused_nodes, /*convert_legacy_fed_inputs=*/true, + /*graph_as_function=*/false, /*upgrade_legacy=*/true, + /*enable_shape_inference=*/true, context); +} + +Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager) { + mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), + /*propagate=*/true); + if (failed(pass_manager->run(module))) { + return statusHandler.ConsumeStatus(); + } + + if (export_to_mlir) { + llvm::raw_string_ostream os(*result); + module.print(os); + return Status::OK(); + } + + return tfjs::MlirToJSONTranslateFunction(module, result) + ? Status::OK() + : statusHandler.ConsumeStatus(); +} + +StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context) { + std::unordered_set tags = absl::StrSplit(saved_model_tags, ','); + std::vector exported_names_in_vector = + absl::StrSplit(saved_model_exported_names, ',', absl::SkipEmpty()); + absl::Span exported_names(exported_names_in_vector); + if (import_saved_model) { + auto module = tensorflow::SavedModelObjectGraphToMlirImport( + input_filename, tags, absl::Span(exported_names), context); + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else if (import_saved_model_v1) { + auto module = tensorflow::SavedModelSignatureDefsToMlirImport( + input_filename, tags, exported_names, context); + + if (!module) + return tensorflow::errors::InvalidArgument("fail to open input file"); + TF_RETURN_IF_ERROR(RegisterCustomOps(extra_tf_opdefs)); + return module; + } else { + return tensorflow::errors::InvalidArgument( + "Should be either saved model v1 or v2"); + } +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h new file mode 100644 index 00000000000..d68f0e7d46e --- /dev/null +++ b/tensorflow/compiler/mlir/tfjs/translate/tf_to_tfjs_json.h @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ +#define TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ + +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/Support/SourceMgr.h" +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Module.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/core/platform/status.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace tensorflow { + +// Load a TF model from a GraphDef definition or a TF control flow dialect MLIR +// source into a MLIR module. If `input_mlir` is true, load from a MLIR source +// file; otherwise, load from a GraphDef. +// Setting prune_unused_nodes to true, would prune unreachable nodes if +// output_arrays is specified. +stream_executor::port::StatusOr +LoadFromGraphdefOrMlirSource( + const std::string& input_filename, bool input_mlir, + const std::vector& extra_tf_opdefs, + absl::string_view debug_info_file, absl::string_view input_arrays, + absl::string_view input_dtypes, absl::string_view input_shapes, + absl::string_view output_arrays, bool prune_unused_nodes, + llvm::SourceMgr* source_mgr, mlir::MLIRContext* context); + +// Load Saved model (either v1 or v2) into MLIR. +stream_executor::port::StatusOr ImportSavedModel( + bool import_saved_model, bool import_saved_model_v1, + const std::vector& extra_tf_opdefs, + const std::string& input_filename, const std::string& saved_model_tags, + const std::string& saved_model_exported_names, mlir::MLIRContext* context); + +// Taking a MLIR module in TF executor dialect and a set of parameters, +// applies a set of passes to convert the module to TFJS dialect and +// serializes the result to JSON string. +// If `export_to_mlir` is true, the result is exported in MLIR text format, +// otherwise exported in JSON. +Status ConvertTFOpsToTfjsJSON(mlir::ModuleOp module, bool export_to_mlir, + std::string* result, + mlir::PassManager* pass_manager); +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFJS_TRANSLATE_TF_TO_TFJS_JSON_H_ From 88acf9fcc52b17def5f3600dcf02744cf655fec1 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Tue, 12 May 2020 11:15:46 -0700 Subject: [PATCH 041/412] Install the `wrapt` pip package. 
PiperOrigin-RevId: 311164427 Change-Id: Ia1b287cf2285861dbc86be2349d4c322061dbbf8 --- tensorflow/tools/ci_build/release/common.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/tools/ci_build/release/common.sh b/tensorflow/tools/ci_build/release/common.sh index a6ef52b8bea..bb40042e3af 100644 --- a/tensorflow/tools/ci_build/release/common.sh +++ b/tensorflow/tools/ci_build/release/common.sh @@ -146,6 +146,7 @@ function install_pip_deps { ${PIP_CMD} install --user --upgrade attrs ${PIP_CMD} install --user --upgrade tf-estimator-nightly ${PIP_CMD} install --user --upgrade "future>=0.17.1" + ${PIP_CMD} install --user --upgrade wrapt # LINT.ThenChange(:ubuntu_16_pip_installations) } @@ -178,6 +179,7 @@ function install_ubuntu_16_pip_deps { "${PIP_CMD}" install PyYAML==3.13 --user "${PIP_CMD}" install --user --upgrade tf-estimator-nightly "${PIP_CMD}" install --user --upgrade tb-nightly + "${PIP_CMD}" install --user --upgrade wrapt # LINT.ThenChange(:ubuntu_pip_installations) } @@ -219,6 +221,7 @@ function install_macos_pip_deps { ${SUDO_CMD} ${PIP_CMD} install --upgrade tb-nightly ${PIP_CMD} install --user --upgrade attrs ${PIP_CMD} install --user --upgrade tf-estimator-nightly + ${PIP_CMD} install --user --upgrade wrapt ${PIP_CMD} install --user --upgrade "future>=0.17.1" } From a2afd0e3588725f3839522e75e324febc9aaeaf5 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Tue, 12 May 2020 11:24:53 -0700 Subject: [PATCH 042/412] Refactor MLIR TF shape inference to have a context This enables reusing the partial results computed/caching the query results (ValuePortResultMap). This also reduce some arguments being passed around (else in the follow up I'd need to pass a context everywhere). Should be NFC change. PiperOrigin-RevId: 311166241 Change-Id: Icb6ea66c6c16a06d4bc9077225f1d7a783548dca --- .../tensorflow/transforms/shape_inference.cc | 292 ++++++++++-------- 1 file changed, 162 insertions(+), 130 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 41902c46b40..5a2cae38062 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -66,8 +66,7 @@ using tensorflow::shape_inference::ShapeHandle; namespace mlir { namespace TF { namespace { -Optional> InferShapeForFunctionReturnType( - FuncOp func) { +Optional> InferShapeForFunctionReturnType(FuncOp func) { // Find any return ops. 
SmallVector return_ops; for (Block& block : func) { @@ -137,9 +136,9 @@ void AddCastBackForUnsupportedNonTFUses(Operation* op, Value result, cast_op = b.create(op->getLoc(), old_type, result, /*truncate=*/b.getBoolAttr(false)); } - return mlir::Value(cast_op); + return Value(cast_op); }; - for (OpOperand& use : llvm::make_early_inc_range(result.getUses())) { + for (OpOperand& use : make_early_inc_range(result.getUses())) { if (use.getOwner()->getDialect() != tf_dialect && !IsSupportedNonTFOp(use.getOwner())) use.set(get_cast_op()); @@ -162,7 +161,7 @@ Optional GetShapeFromMlirType(Type t) { bool InferShapeForPassThroughOps(OperandRange pass_through_operands, Operation* op, Dialect* tf_dialect) { bool changed = false; - for (auto entry : llvm::zip(pass_through_operands, op->getResults())) { + for (auto entry : zip(pass_through_operands, op->getResults())) { Type operand_type = std::get<0>(entry).getType(); Value result = std::get<1>(entry); if (result.getType() == operand_type) continue; @@ -204,7 +203,7 @@ bool InferShapeForNonTFDialectOperation(Operation* op, Dialect* tf_dialect) { tf_dialect); } // TODO(b/155227679): Use OpInterface instead of hard-coding for TensorCastOp. - if (auto tensor_cast = dyn_cast(op)) { + if (auto tensor_cast = dyn_cast(op)) { return InferShapeForPassThroughOps( tensor_cast.getOperation()->getOperands(), op, tf_dialect); } @@ -254,7 +253,7 @@ GetSubtypes(Type type) { // match the i-th operand type). Returns true if anything is changed. bool PassThroughOperandTypes(OperandRange operands, ResultRange results) { bool changed = false; - for (auto entry : llvm::zip(operands, results)) { + for (auto entry : zip(operands, results)) { Type operand_type = std::get<0>(entry).getType(); Type result_type = std::get<1>(entry).getType(); if (operand_type == result_type) continue; @@ -291,14 +290,13 @@ bool InferShapeForCall(Operation* op) { CallInterfaceCallable callable = call_op.getCallableForCallee(); SymbolRefAttr sym = callable.dyn_cast(); if (!sym) return false; - FuncOp func = - dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); + FuncOp func = dyn_cast(SymbolTable::lookupNearestSymbolFrom(op, sym)); if (!func) return false; bool changed = false; // Map each of the results of the call to the returned type of the // function. - for (auto result : llvm::zip(op->getResults(), func.getType().getResults())) { + for (auto result : zip(op->getResults(), func.getType().getResults())) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; // Skip already statically shaped results. if (!CanBeRefined(std::get<0>(result).getType())) continue; @@ -335,7 +333,7 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, // Map each of the results of the call to the returned type of the // function. bool changed = false; - for (auto result : llvm::zip(op->getResults(), inferred)) { + for (auto result : zip(op->getResults(), inferred)) { if (std::get<0>(result).getType() == std::get<1>(result)) continue; // Inserts a cast back to the original type if any user is not in the @@ -356,7 +354,7 @@ bool RefineWithInferTypeOpInterface(InferTypeOpInterface infer_ti, // so for tf.Const -> tensor<10x20xf32>, [0,2,18] would point to a unique output // scalar value). 
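 // (For a whole result the port is simply the result index: constructing a
 // ValuePort from the i-th OpResult of an op records that op as the producer
 // and port = {i}, as the constructors below show.)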
struct ValuePort { - llvm::PointerUnion producer; + PointerUnion producer; SmallVector port; bool operator==(const ValuePort& other) const { @@ -374,39 +372,38 @@ struct ValuePort { port = {0}; } } - ValuePort(llvm::PointerUnion producer, + ValuePort(PointerUnion producer, SmallVector port) : producer(producer), port(port) {} - llvm::raw_ostream& print(llvm::raw_ostream& os) const { + raw_ostream& print(raw_ostream& os) const { if (auto* op = producer.dyn_cast()) os << "op " << op->getName(); if (auto ba = producer.dyn_cast()) os << "block_arg " << ba.getArgNumber(); - os << llvm::formatv(" [{0}]", llvm::make_range(port.begin(), port.end())); + os << formatv(" [{0}]", llvm::make_range(port.begin(), port.end())); return os; } }; struct ValuePortHasher { std::size_t operator()(const ValuePort& other) const { - return llvm::hash_combine( - llvm::hash_value(other.producer.getOpaqueValue()), - llvm::hash_value(ArrayRef(other.port))); + return hash_combine(llvm::hash_value(other.producer.getOpaqueValue()), + hash_value(ArrayRef(other.port))); } }; using ValuePortResultMap = std::unordered_map; -using ComputedQueryFn = llvm::function_ref; -using ValueQueryFn = llvm::function_ref; -using ValuePortInputs = llvm::SmallVectorImpl; +using ComputedQueryFn = function_ref; +using ValueQueryFn = function_ref; +using ValuePortInputs = SmallVectorImpl; -// TODO(jpienaar): InputsRequiredForOutput and ComputeOutputComponent are +// TODO(jpienaar): ComputeInputsRequiredForOutput and ComputeOutputComponent are // intended to be switched to op interfaces once more refined. -LogicalResult InputsRequiredForOutput(ValuePort value_port, - ComputedQueryFn has_been_computed, - ValuePortInputs* inputs) { +LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ComputedQueryFn has_been_computed, + ValuePortInputs* inputs) { auto op = value_port.producer.dyn_cast(); auto& port = value_port.port; if (!op) return failure(); @@ -460,26 +457,94 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, return nullptr; } -ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { +// Context used during ShapeInference. This class contains common information +// that is required by the individual shape inference helper functions (e.g., +// TF Graph version, constant values computed, etc.) +class ShapeInference { + public: + ShapeInference(int64_t graph_version, MLIRContext* context); + + LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, + ValuePortInputs* inputs) { + return ::mlir::TF::ComputeInputsRequiredForOutput( + value_port, + [this](const ValuePort& port) { + return results_.find(port) != results_.end(); + }, + inputs); + } + + Attribute ComputeOutputComponent(const ValuePort& value_port) { + return ::mlir::TF::ComputeOutputComponent( + value_port, [this](const ValuePort& port) { return results_[port]; }); + } + + // Returns ShapeHandle if the op result could be computed as shape. + ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic); + + void RecordValue(const ValuePort& value_port, Attribute value) { + results_[value_port] = value; + } + + // Performs shape inference on the provided op and return true if the type of + // at least one result has been changed. + // A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. + // `graph_version` indicates the current GraphDef compatibility versions + // (the versions field in graph.proto). 
+ bool InferShapeForSingleOperation(Operation* op); + + // Infers shape on the provided region, including nested ones, iterate until + // fix point with a limit of max_iteration. Returns success if fix point is + // reached before max_iteration. + LogicalResult InferShapeUntilFixPoint(Region* region, + int64_t max_iteration = 10); + + // Updates input types and refine shapes inside body of functions that are + // attached to ControlFlow ops (If/While). These functions include Then/Else + // branches of IfOp and Cond/Body functions of WhileOp. These functions share + // following common properties: + // 1) They are never reused, ie. having a single use in module. + // 2) Their input types match those of their parent ops (excluding inputs + // like predicate). + // Returns a boolean indicating whether any change has been applied. + LogicalResult RefineShapeForControlFlowFunc(FuncOp func, + ArrayRef input_types, + int64_t max_iteration); + + // Propagate the shapes to the functions named. + LogicalResult PropagateShapeToFunctions( + ModuleOp module, Operation::operand_type_range input_types, + ArrayRef func_names, int64_t max_iteration); + + // Shape propagation for call/control flow ops. + LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, + int64_t max_iteration); + + private: + // Mapping between ValuePort (which corresponds to an OpResult or smaller, + // e.g., first element of OpResult produded) to an Attribute if the ValuePort + // corresponds to a constant value. + ValuePortResultMap results_; + int64_t graph_version_; + MLIRContext* context_; + Dialect* tf_dialect_; +}; + +ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context) + : graph_version_(graph_version) { + context_ = context; + tf_dialect_ = context->getRegisteredDialect(); +} + +ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, + InferenceContext* ic) { LLVM_DEBUG(result.print(llvm::dbgs() << "\nEvaluate partially ")); auto rt = result.getType().dyn_cast(); if (!rt || !rt.hasStaticShape() || rt.getRank() != 1) return {}; int dim_size = rt.getDimSize(0); // Worklist to direct partial evaluation. - llvm::SmallVector worklist; - // The ValuePort evaluated results. - // TODO(jpienaar): This could be cached across invocations (e.g., part of some - // inference context). - ValuePortResultMap evaluated; - // Returns whether a ValuePort has been previously computed. - auto has_been_computed = [&evaluated](const ValuePort& port) { - return evaluated.find(port) != evaluated.end(); - }; - // Returns previously computed ValuePort value. - auto values = [&evaluated](const ValuePort& port) -> Attribute { - return evaluated[port]; - }; + SmallVector worklist; // Simple evaluator that attempts to partially evaluate the input value even // if unable to evaluate the complete output. Below follows a simple stack @@ -498,7 +563,7 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { LLVM_DEBUG(front.print(llvm::errs() << "\nWorklist front ")); SmallVector inputs; - auto res = InputsRequiredForOutput(front, has_been_computed, &inputs); + auto res = ComputeInputsRequiredForOutput(front, &inputs); if (failed(res)) { // Abort if unable to find which required inputs need to be computed. 
worklist.clear(); @@ -513,16 +578,16 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { continue; } - auto ret = ComputeOutputComponent(front, values); + auto ret = ComputeOutputComponent(front); if (!ret) continue; - evaluated[front] = ret; + RecordValue(front, ret); LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); // If worklist is empty, then this is the root query op. if (worklist.empty()) { LLVM_DEBUG(llvm::dbgs() << "[root node]\n"); - if (auto dea = ret.dyn_cast()) { + if (auto dea = ret.dyn_cast()) { if (dea.getNumElements() != 1) { LLVM_DEBUG(llvm::errs() << "Unexpected number of elements\n"); return {}; @@ -536,14 +601,8 @@ ShapeHandle ComputeOutputAsShape(OpResult result, InferenceContext* ic) { return ic->MakeShape(dims); } -// Performs shape inference on the provided op and return true if the type of -// at least one result has been changed. -// A tf.Cast() is inserted for any uses that isn't in the TensorFlow dialect. -// `graph_version` indicates the current GraphDef compatibility versions -// (the versions field in graph.proto). -bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, - int64_t graph_version) { - assert(tf_dialect == op->getDialect()); +bool ShapeInference::InferShapeForSingleOperation(Operation* op) { + assert(tf_dialect_ == op->getDialect()); // The shape function of these ops sometimes does not propagate subtypes // (handle shapes) for resource and variant types. We use a simple passthrough // to make sure they are preserved in the output. @@ -555,7 +614,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // If no result for this op needs shape inference, we have a fast-path return. // But if the type is a resource/variant, we do not skip it because we might // not have the handle shapes. - if (llvm::none_of(op->getResultTypes(), CanBeRefined)) { + if (none_of(op->getResultTypes(), CanBeRefined)) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for statically shaped op '" << op->getName() << "'.\n"); return false; @@ -570,8 +629,8 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // This is necessary to avoid reprocessing the tf.Cast that are inserted at // the end of this function. if (isa(op) && - llvm::all_of(op->getResult(0).getUsers(), [&](Operation* user) { - return user->getDialect() != tf_dialect; + all_of(op->getResult(0).getUsers(), [&](Operation* user) { + return user->getDialect() != tf_dialect_; })) { LLVM_DEBUG(llvm::dbgs() << "Skipping inference for tf.Cast with no TF " "dialect operation users '" @@ -651,7 +710,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // Perform the shape inference using an InferenceContext with the input // shapes. This object is abstracting the information that the ShapeInference // function operates on. - InferenceContext c(graph_version, *node_def, op_reg_data->op_def, + InferenceContext c(graph_version_, *node_def, op_reg_data->op_def, input_shapes, input_tensors, /*input_tensors_as_shapes=*/{}, handle_shapes_and_types); auto status = c.Run(op_reg_data->shape_inference_fn); @@ -664,7 +723,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, // Determine if, during shape computation, the shape functions attempted to // query an input operand as shape where the input was not known/constant. 
bool requires_inputs = - llvm::any_of(llvm::seq(0, c.num_inputs()), [&](int input) { + any_of(llvm::seq(0, c.num_inputs()), [&](int input) { return c.requested_input_tensor_as_partial_shape(input) && !input_tensors[input]; }); @@ -728,7 +787,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, new_element_type.isa()) { auto handle_shapes_types = c.output_handle_shapes_and_types(output); if (handle_shapes_types) { - llvm::SmallVector subtypes; + SmallVector subtypes; OpBuilder b(op); for (const auto& shape_n_type : *handle_shapes_types) { Type element_type; @@ -748,7 +807,7 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, if (result.getType() == new_type) continue; // Inserts a cast back to the original type if any user is not in the TF // dialect. - AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect, + AddCastBackForUnsupportedNonTFUses(op, result, tf_dialect_, result.getType()); // Finally we inferred the shape and replace the type for this result. result.setType(new_type); @@ -760,29 +819,13 @@ bool InferShapeForSingleOperation(Operation* op, Dialect* tf_dialect, return changed; } -// Infers shape on the provided region, including nested ones, iterate until fix -// point with a limit of max_iteration. Returns success if fix point is reached -// before max_iteration. -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration = 10); - -// Updates input types and refine shapes inside body of functions that are -// attached to ControlFlow ops (If/While). These functions include Then/Else -// branches of IfOp and Cond/Body functions of WhileOp. These functions share -// following common properties: -// 1) They are never reused, ie. having a single use in module. -// 2) Their input types match those of their parent ops (excluding inputs like -// predicate). -// Returns a boolean indicating whether any change has been applied. 
-LogicalResult RefineShapeForControlFlowFunc(FuncOp func, - llvm::ArrayRef input_types, - int64_t graph_version, - int64_t max_iteration) { +LogicalResult ShapeInference::RefineShapeForControlFlowFunc( + FuncOp func, ArrayRef input_types, int64_t max_iteration) { ModuleOp module = func.getParentOfType(); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); if (num_uses != 1) { - func.emitWarning(llvm::formatv( + func.emitWarning(formatv( "expected control flow function {0} to have exactly 1 use, found {1}.", func.getName(), num_uses)); return failure(); @@ -796,8 +839,7 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, arg_and_idx.value().setType(input_types[arg_and_idx.index()]); } - auto res = - InferShapeUntilFixPoint(&func.getBody(), graph_version, max_iteration); + auto res = InferShapeUntilFixPoint(&func.getBody(), max_iteration); if (failed(res)) return res; auto new_return_types = InferShapeForFunctionReturnType(func); @@ -809,20 +851,18 @@ LogicalResult RefineShapeForControlFlowFunc(FuncOp func, return success(); } -LogicalResult PropagateShapeToFunctions( +LogicalResult ShapeInference::PropagateShapeToFunctions( ModuleOp module, Operation::operand_type_range input_types, - llvm::ArrayRef func_names, int64_t graph_version, - int64_t max_iteration) { - bool success = true; + ArrayRef func_names, int64_t max_iteration) { + bool all_succeeded = true; auto types = llvm::to_vector<4>(input_types); for (auto func_name : func_names) { FuncOp func = module.lookupSymbol(func_name); - if (failed(RefineShapeForControlFlowFunc(func, types, graph_version, - max_iteration))) { - success = false; - } + all_succeeded = + succeeded(RefineShapeForControlFlowFunc(func, types, max_iteration)) && + all_succeeded; } - return mlir::success(success); + return success(all_succeeded); } // If the callee has only one use, propagates any constant operand of call_op to @@ -842,7 +882,7 @@ void PropagateConstantToCallee(CallOpInterface call_op, // the constant inside the function. 
for (auto arg : func.getArguments()) { auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); - if (llvm::isa_and_nonnull(operand)) { + if (isa_and_nonnull(operand)) { arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); } } @@ -861,33 +901,31 @@ void PropagateConstantFromCallee(CallOpInterface call_op, for (auto retval : llvm::enumerate(func.front().getTerminator()->getOperands())) { auto retval_op = retval.value().getDefiningOp(); - if (llvm::isa_and_nonnull(retval_op)) { + if (isa_and_nonnull(retval_op)) { op->getResult(retval.index()) .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); } } } -LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, - int64_t graph_version, - int64_t max_iteration) { +LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( + Operation* op, int64_t max_iteration) { ModuleOp module = op->getParentOfType(); if (auto if_op = dyn_cast(op)) { return PropagateShapeToFunctions( - module, llvm::drop_begin(if_op.getOperandTypes(), 1), - {if_op.then_branch(), if_op.else_branch()}, graph_version, - max_iteration); + module, drop_begin(if_op.getOperandTypes(), 1), + {if_op.then_branch(), if_op.else_branch()}, max_iteration); } else if (auto while_op = dyn_cast(op)) { return PropagateShapeToFunctions(module, while_op.getOperandTypes(), {while_op.cond(), while_op.body()}, - graph_version, max_iteration); + max_iteration); } else if (auto call_op = dyn_cast(op)) { CallInterfaceCallable callable = call_op.getCallableForCallee(); if (SymbolRefAttr sym = callable.dyn_cast()) { PropagateConstantToCallee(call_op, sym, module); if (failed(PropagateShapeToFunctions( module, call_op.getArgOperands().getTypes(), - {sym.getRootReference()}, graph_version, max_iteration))) { + {sym.getRootReference()}, max_iteration))) { return failure(); } PropagateConstantFromCallee(call_op, sym, module); @@ -900,13 +938,10 @@ LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, return success(); } -LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, - int64_t max_iteration) { - MLIRContext* ctx = region->getContext(); - Dialect* tf_dialect = ctx->getRegisteredDialect(); - - // An operation folder that is used to attempt folding before inference. - OperationFolder folder(ctx); +LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, + int64_t max_iteration) { + // An operation folder that is used to attempt folding before inference._ + OperationFolder folder(context_); bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -919,14 +954,14 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, << "Shape inference, iteration " << iteration << "\n"); region->walk([&](Operation* op) { if (auto infer_ti = dyn_cast(op)) { - changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect); + changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); // TODO(jpienaar): Debug why we can't just return here. We end up with // additional constant due to the propagation of constant into attached // function if we return already. } - if (op->getDialect() != tf_dialect) { - changed |= InferShapeForNonTFDialectOperation(op, tf_dialect); + if (op->getDialect() != tf_dialect_) { + changed |= InferShapeForNonTFDialectOperation(op, tf_dialect_); return; } @@ -935,13 +970,12 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, // Best-effort shape inference in attached functions. 
Do not return // failure even if it doesn't get to fixed point. - if (failed(PropagateShapeIntoAttachedFunctions(op, graph_version, - max_iteration))) { + if (failed(PropagateShapeIntoAttachedFunctions(op, max_iteration))) { op->emitWarning() << "unable to refine shape of attached function " "arguments and bodies"; } - changed |= InferShapeForSingleOperation(op, tf_dialect, graph_version); + changed |= InferShapeForSingleOperation(op); }); } @@ -956,44 +990,43 @@ LogicalResult InferShapeUntilFixPoint(Region* region, int64_t graph_version, LogicalResult InferShapeForFunction(FuncOp func, ArrayRef> arg_shapes, int64_t graph_version) { + ShapeInference context(graph_version, func.getContext()); if (arg_shapes.empty()) { - if (failed(InferShapeUntilFixPoint(&func.getBody(), graph_version))) + if (failed(context.InferShapeUntilFixPoint(&func.getBody()))) return failure(); // TODO(b/156276510): Verify that it is always fine to refine a function's // return type, as long as we do not change the argument shapes. if (auto return_types = InferShapeForFunctionReturnType(func)) { - func.setType(mlir::FunctionType::get(func.getType().getInputs(), - return_types.getValue(), - func.getContext())); + func.setType(FunctionType::get(func.getType().getInputs(), + return_types.getValue(), + func.getContext())); } return success(); } - mlir::FunctionType func_type = func.getType(); + FunctionType func_type = func.getType(); bool needs_refinement = false; - llvm::SmallVector new_arg_types; + SmallVector new_arg_types; new_arg_types.reserve(func_type.getNumInputs()); // Update argument types in-place using the provided arg_shapes. for (size_t i = 0; i < func_type.getNumInputs(); ++i) { ArrayRef shape = arg_shapes[i]; - mlir::Type element_type; - if (auto input_ty = - func_type.getInput(i).dyn_cast()) { + Type element_type; + if (auto input_ty = func_type.getInput(i).dyn_cast()) { if (!input_ty || input_ty.getShape().size() != shape.size()) { return failure(); } element_type = input_ty.getElementType(); } else { - auto unranked_input_ty = - func_type.getInput(i).dyn_cast(); + auto unranked_input_ty = func_type.getInput(i).dyn_cast(); if (!unranked_input_ty) { return failure(); } element_type = unranked_input_ty.getElementType(); } - auto new_arg_type = mlir::RankedTensorType::get(shape, element_type); + auto new_arg_type = RankedTensorType::get(shape, element_type); if (new_arg_type != func_type.getInput(i)) { // If the new type is more detailed, trigger shape inference. func.getArgument(i).setType(new_arg_type); @@ -1006,18 +1039,17 @@ LogicalResult InferShapeForFunction(FuncOp func, return success(); } - mlir::LogicalResult result = - mlir::TF::InferShapeUntilFixPoint(&func.getBody(), graph_version); + LogicalResult result = context.InferShapeUntilFixPoint(&func.getBody()); if (failed(result)) { return failure(); } auto return_types = InferShapeForFunctionReturnType(func); - func.setType(mlir::FunctionType::get(new_arg_types, - return_types.hasValue() - ? return_types.getValue() - : func.getType().getResults(), - func.getContext())); + func.setType(FunctionType::get(new_arg_types, + return_types.hasValue() + ? return_types.getValue() + : func.getType().getResults(), + func.getContext())); return success(); } From 563a8a5ce0f9583ccfcbda97a9b0c9fd8d3620d0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 11:32:47 -0700 Subject: [PATCH 043/412] Add bz2-devel so python will be compiled with bz2 support. 
PiperOrigin-RevId: 311167882 Change-Id: Ideeb21ae9bd8507d0e2cad4c95d4e81fb0d344fa --- ...rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython index 2e520f62cde..9c85091563e 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython @@ -59,11 +59,12 @@ RUN /install/install_deb_packages.sh # - dependencies to build Python from source # - patchelf, as it is required by auditwheel RUN apt-get update && apt-get install -y \ - libncurses5-dev \ + libbz2-dev \ + libffi-dev \ libgdbm-dev \ + libncurses5-dev \ libnss3-dev \ libreadline-dev \ - libffi-dev \ patchelf \ && \ rm -rf /var/lib/apt/lists/* From b730a73909790d08fbfbf8977e77ab5b57d2d2e6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 11:44:13 -0700 Subject: [PATCH 044/412] Update manylinux docker image to latest hash. PiperOrigin-RevId: 311170200 Change-Id: Icebc7ba48fc5de8b9a638d39ecc87e6a47140e08 --- third_party/toolchains/preconfig/generate/containers.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index b1d0389a16d..9be398f5f2d 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -9,7 +9,7 @@ container_digests = { "cuda10.1-cudnn7-centos6": "sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9", "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432", "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:cc7f760195d7bbe283b45ae740409751d0b74d8ffbdc2f7a3cb62c71a71fbe25", - "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:c460570b88eab3da92f06fdf30098d89be4de0f3b010ee3d39086f4d000dd3b8", + "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:13aa5e700bb609521cd4365d4152d7d8f4118cae7ce174ce7d54cc529e21766a", "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe", "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12", } From 1712a14d011035b61cdce1c578646f557ef422da Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Tue, 12 May 2020 11:53:10 -0700 Subject: [PATCH 045/412] [tfdbg2] Ensure initialization on DebugEventsWriter.WriteGraphExecutionTrace() Background: - When a TF Graph that contains tfdbg2's `DebugIdentityV2` ops is transferred to a remote server (tensorflow_std_server) and executed remotely, the `DebugEventsWriter.Init()` method is not called beforehand. This is different from the case where the Graph is executed on the localhost, where the `Init()` method is called from Python binding when the `tf.debugging.experimental.enable_dump_debug_info()` API is called. - This can cause the remotely-executing Graph to fail to write debug logs - This CL corrects that by calling `Init()` from the `WriteGraphExecutionTrace()` method (i.e., the method used by the `DebugIdentityV2` op. 
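Illustrative sketch (not part of the patch itself): the fix amounts to having the write path lazily initialize the writer and propagate the resulting Status. A minimal version, simplified from the change below and omitting the circular-buffer branch; Init() is assumed to be idempotent so repeated calls are cheap:

    Status DebugEventsWriter::WriteGraphExecutionTrace(
        GraphExecutionTrace* graph_execution_trace) {
      // Ensure the metadata file and per-type writers exist even when the
      // Python-side enable_dump_debug_info() never ran on this host, e.g.
      // for graphs executed remotely on a tensorflow_std_server.
      TF_RETURN_IF_ERROR(Init());
      DebugEvent debug_event;
      debug_event.set_allocated_graph_execution_trace(graph_execution_trace);
      return SerializeAndWriteDebugEvent(&debug_event, GRAPH_EXECUTION_TRACES);
    }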
PiperOrigin-RevId: 311171858 Change-Id: I0726ff363a991b1a9edb8b3d824b09374100d338 --- tensorflow/core/kernels/debug_ops.h | 6 +- tensorflow/core/util/debug_events_writer.cc | 50 ++++++++------- tensorflow/core/util/debug_events_writer.h | 26 ++++---- .../core/util/debug_events_writer_test.cc | 63 +++++++++++++------ 4 files changed, 88 insertions(+), 57 deletions(-) diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 00356778026..42364e416ea 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -435,9 +435,9 @@ class DebugIdentityV2Op : public OpKernel { for (const string& dump_root : dump_roots_) { tfdbg::DebugEventsWriter* debug_events_writer = tfdbg::DebugEventsWriter::GetDebugEventsWriter(dump_root); - debug_events_writer->WriteGraphExecutionTrace( - tfdbg_context_id_, device_name_, op_name_, output_slot_, - tensor_debug_mode_, tensor); + OP_REQUIRES_OK(context, debug_events_writer->WriteGraphExecutionTrace( + tfdbg_context_id_, device_name_, op_name_, + output_slot_, tensor_debug_mode_, tensor)); } context->set_output(0, tensor); } diff --git a/tensorflow/core/util/debug_events_writer.cc b/tensorflow/core/util/debug_events_writer.cc index 595f92d07c0..d9c3393ce3c 100644 --- a/tensorflow/core/util/debug_events_writer.cc +++ b/tensorflow/core/util/debug_events_writer.cc @@ -179,7 +179,7 @@ Status DebugEventsWriter::Init() { metadata->set_tensorflow_version(TF_VERSION_STRING); metadata->set_file_version( strings::Printf("%s%d", kVersionPrefix, kCurrentFormatVersion)); - SerializeAndWriteDebugEvent(&debug_event, METADATA); + TF_RETURN_IF_ERROR(SerializeAndWriteDebugEvent(&debug_event, METADATA)); TF_RETURN_WITH_CONTEXT_IF_ERROR( metadata_writer_->Flush(), "Failed to flush debug event metadata writer"); @@ -189,38 +189,38 @@ Status DebugEventsWriter::Init() { return Status::OK(); } -void DebugEventsWriter::WriteSourceFile(SourceFile* source_file) { +Status DebugEventsWriter::WriteSourceFile(SourceFile* source_file) { DebugEvent debug_event; debug_event.set_allocated_source_file(source_file); - SerializeAndWriteDebugEvent(&debug_event, SOURCE_FILES); + return SerializeAndWriteDebugEvent(&debug_event, SOURCE_FILES); } -void DebugEventsWriter::WriteStackFrameWithId( +Status DebugEventsWriter::WriteStackFrameWithId( StackFrameWithId* stack_frame_with_id) { DebugEvent debug_event; debug_event.set_allocated_stack_frame_with_id(stack_frame_with_id); - SerializeAndWriteDebugEvent(&debug_event, STACK_FRAMES); + return SerializeAndWriteDebugEvent(&debug_event, STACK_FRAMES); } -void DebugEventsWriter::WriteGraphOpCreation( +Status DebugEventsWriter::WriteGraphOpCreation( GraphOpCreation* graph_op_creation) { DebugEvent debug_event; debug_event.set_allocated_graph_op_creation(graph_op_creation); - SerializeAndWriteDebugEvent(&debug_event, GRAPHS); + return SerializeAndWriteDebugEvent(&debug_event, GRAPHS); } -void DebugEventsWriter::WriteDebuggedGraph(DebuggedGraph* debugged_graph) { +Status DebugEventsWriter::WriteDebuggedGraph(DebuggedGraph* debugged_graph) { DebugEvent debug_event; debug_event.set_allocated_debugged_graph(debugged_graph); - SerializeAndWriteDebugEvent(&debug_event, GRAPHS); + return SerializeAndWriteDebugEvent(&debug_event, GRAPHS); } -void DebugEventsWriter::WriteExecution(Execution* execution) { +Status DebugEventsWriter::WriteExecution(Execution* execution) { if (circular_buffer_size_ <= 0) { // No cyclic-buffer behavior. 
DebugEvent debug_event; debug_event.set_allocated_execution(execution); - SerializeAndWriteDebugEvent(&debug_event, EXECUTION); + return SerializeAndWriteDebugEvent(&debug_event, EXECUTION); } else { // Circular buffer behavior. DebugEvent debug_event; @@ -234,16 +234,18 @@ void DebugEventsWriter::WriteExecution(Execution* execution) { if (execution_buffer_.size() > circular_buffer_size_) { execution_buffer_.pop_front(); } + return Status::OK(); } } -void DebugEventsWriter::WriteGraphExecutionTrace( +Status DebugEventsWriter::WriteGraphExecutionTrace( GraphExecutionTrace* graph_execution_trace) { + TF_RETURN_IF_ERROR(Init()); if (circular_buffer_size_ <= 0) { // No cyclic-buffer behavior. DebugEvent debug_event; debug_event.set_allocated_graph_execution_trace(graph_execution_trace); - SerializeAndWriteDebugEvent(&debug_event, GRAPH_EXECUTION_TRACES); + return SerializeAndWriteDebugEvent(&debug_event, GRAPH_EXECUTION_TRACES); } else { // Circular buffer behavior. DebugEvent debug_event; @@ -257,15 +259,14 @@ void DebugEventsWriter::WriteGraphExecutionTrace( if (graph_execution_trace_buffer_.size() > circular_buffer_size_) { graph_execution_trace_buffer_.pop_front(); } + return Status::OK(); } } -void DebugEventsWriter::WriteGraphExecutionTrace(const string& tfdbg_context_id, - const string& device_name, - const string& op_name, - int32 output_slot, - int32 tensor_debug_mode, - const Tensor& tensor_value) { +Status DebugEventsWriter::WriteGraphExecutionTrace( + const string& tfdbg_context_id, const string& device_name, + const string& op_name, int32 output_slot, int32 tensor_debug_mode, + const Tensor& tensor_value) { std::unique_ptr trace(new GraphExecutionTrace()); trace->set_tfdbg_context_id(tfdbg_context_id); if (!op_name.empty()) { @@ -279,7 +280,7 @@ void DebugEventsWriter::WriteGraphExecutionTrace(const string& tfdbg_context_id, } trace->set_device_name(device_name); tensor_value.AsProtoTensorContent(trace->mutable_tensor_proto()); - WriteGraphExecutionTrace(trace.release()); + return WriteGraphExecutionTrace(trace.release()); } void DebugEventsWriter::WriteSerializedNonExecutionDebugEvent( @@ -487,8 +488,8 @@ Status DebugEventsWriter::InitNonMetadataFile(DebugEventFileType type) { return Status::OK(); } -void DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event, - DebugEventFileType type) { +Status DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event, + DebugEventFileType type) { std::unique_ptr* writer = nullptr; SelectWriter(type, &writer); if (writer != nullptr) { @@ -497,6 +498,11 @@ void DebugEventsWriter::SerializeAndWriteDebugEvent(DebugEvent* debug_event, string str; debug_event->AppendToString(&str); (*writer)->WriteSerializedDebugEvent(str); + return Status::OK(); + } else { + return errors::Internal( + "Unable to find debug events file writer for DebugEventsFileType ", + type); } } diff --git a/tensorflow/core/util/debug_events_writer.h b/tensorflow/core/util/debug_events_writer.h index 6d219d7c9ef..39835adf1a6 100644 --- a/tensorflow/core/util/debug_events_writer.h +++ b/tensorflow/core/util/debug_events_writer.h @@ -119,27 +119,27 @@ class DebugEventsWriter { // The four DebugEvent fields below are written _without_ the circular buffer. // Source file contents are written to the *.source_files file. // Takes ownership of source_file. - void WriteSourceFile(SourceFile* source_file); + Status WriteSourceFile(SourceFile* source_file); // Stack frames are written to the *.code_locations file. // Takes ownership of stack_frame_with_id. 
- void WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id); + Status WriteStackFrameWithId(StackFrameWithId* stack_frame_with_id); // Graph op creation events are written to the *.graphs file. // Takes ownership of graph_op_creation. - void WriteGraphOpCreation(GraphOpCreation* graph_op_creation); + Status WriteGraphOpCreation(GraphOpCreation* graph_op_creation); // Debugged graphs are written to the *.graphs file. // Takes ownership of debugged_graph. - void WriteDebuggedGraph(DebuggedGraph* debugged_graph); + Status WriteDebuggedGraph(DebuggedGraph* debugged_graph); // The two DebugEvent fields below are written to the circular buffer // and saved to disk only at the FlushExecutionFiles() call. // Execution events (eager execution of an op or a tf.function) are written to // the *.execution file. // Takes ownership of execution. - void WriteExecution(Execution* execution); + Status WriteExecution(Execution* execution); // Graph execution traces (graph-internal tensor values or their summaries) // are written to the *.graph_execution_traces file. // Takes ownership of graph_execution_trace. - void WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace); + Status WriteGraphExecutionTrace(GraphExecutionTrace* graph_execution_trace); // Write a graph execution trace without using a protocol buffer. // Instead, pass the raw values related to the graph execution trace. @@ -155,11 +155,11 @@ class DebugEventsWriter { // tensor_value: The value of the tensor that describes the tensor(s) // that this trace is concerned with. The semantics of this tensor value // depends on the value of `tensor_debug_mode`. - void WriteGraphExecutionTrace(const string& tfdbg_context_id, - const string& device_name, - const string& op_name, int32 output_slot, - int32 tensor_debug_mode, - const Tensor& tensor_value); + Status WriteGraphExecutionTrace(const string& tfdbg_context_id, + const string& device_name, + const string& op_name, int32 output_slot, + int32 tensor_debug_mode, + const Tensor& tensor_value); // Writes a serialized DebugEvent to one of the debug-events files // concerned with the non-execution events: the SOURCE_FILES, STACK_FRAMES @@ -217,8 +217,8 @@ class DebugEventsWriter { // Initialize the TFRecord writer for non-metadata file type. 
Status InitNonMetadataFile(DebugEventFileType type); - void SerializeAndWriteDebugEvent(DebugEvent* debug_event, - DebugEventFileType type); + Status SerializeAndWriteDebugEvent(DebugEvent* debug_event, + DebugEventFileType type); void SelectWriter(DebugEventFileType type, std::unique_ptr** writer); diff --git a/tensorflow/core/util/debug_events_writer_test.cc b/tensorflow/core/util/debug_events_writer_test.cc index 66cde55864b..bd0c731bc90 100644 --- a/tensorflow/core/util/debug_events_writer_test.cc +++ b/tensorflow/core/util/debug_events_writer_test.cc @@ -263,7 +263,7 @@ TEST_F(DebugEventsWriterTest, WriteSourceFile) { source_file_1->add_lines(""); source_file_1->add_lines("print(tf.constant([42.0]))"); source_file_1->add_lines(""); - writer->WriteSourceFile(source_file_1); + TF_ASSERT_OK(writer->WriteSourceFile(source_file_1)); SourceFile* source_file_2 = new SourceFile(); source_file_2->set_file_path("/home/tf_programs/train.py"); @@ -271,7 +271,7 @@ TEST_F(DebugEventsWriterTest, WriteSourceFile) { source_file_2->add_lines("import tensorflow.keras as keras"); source_file_2->add_lines(""); source_file_2->add_lines("model = keras.Sequential()"); - writer->WriteSourceFile(source_file_2); + TF_ASSERT_OK(writer->WriteSourceFile(source_file_2)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); TF_ASSERT_OK(writer->Close()); @@ -336,8 +336,8 @@ TEST_F(DebugEventsWriterTest, WriteStackFramesFile) { file_line_col->set_func("my_func"); file_line_col->set_code(" x = x ** 2.0"); - writer->WriteStackFrameWithId(stack_frame_1); - writer->WriteStackFrameWithId(stack_frame_2); + TF_ASSERT_OK(writer->WriteStackFrameWithId(stack_frame_1)); + TF_ASSERT_OK(writer->WriteStackFrameWithId(stack_frame_2)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); TF_ASSERT_OK(writer->Close()); @@ -382,12 +382,12 @@ TEST_F(DebugEventsWriterTest, WriteGraphOpCreationAndDebuggedGraph) { GraphOpCreation* graph_op_creation = new GraphOpCreation(); graph_op_creation->set_op_type("MatMul"); graph_op_creation->set_op_name("Dense_1/MatMul"); - writer->WriteGraphOpCreation(graph_op_creation); + TF_ASSERT_OK(writer->WriteGraphOpCreation(graph_op_creation)); DebuggedGraph* debugged_graph = new DebuggedGraph(); debugged_graph->set_graph_id("deadbeaf"); debugged_graph->set_graph_name("my_func_graph"); - writer->WriteDebuggedGraph(debugged_graph); + TF_ASSERT_OK(writer->WriteDebuggedGraph(debugged_graph)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); TF_ASSERT_OK(writer->Close()); @@ -428,7 +428,7 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheSameFile) { SourceFile* source_file = new SourceFile(); source_file->set_file_path(file_path); source_file->set_host_name("localhost.localdomain"); - writer->WriteSourceFile(source_file); + TF_ASSERT_OK(writer->WriteSourceFile(source_file)); }; for (size_t i = 0; i < kConcurrentWrites; ++i) { thread_pool->Schedule(fn); @@ -469,7 +469,7 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteAndFlushCallsToTheSameFile) { SourceFile* source_file = new SourceFile(); source_file->set_file_path(file_path); source_file->set_host_name("localhost.localdomain"); - writer->WriteSourceFile(source_file); + TF_ASSERT_OK(writer->WriteSourceFile(source_file)); TF_ASSERT_OK(writer->FlushNonExecutionFiles()); }; for (size_t i = 0; i < kConcurrentWrites; ++i) { @@ -512,16 +512,16 @@ TEST_F(DebugEventsWriterTest, ConcurrentWriteCallsToTheDifferentFiles) { source_file->set_file_path( strings::Printf("/home/tf_programs/program_%.2d.py", index)); source_file->set_host_name("localhost.localdomain"); - 
writer->WriteSourceFile(source_file); + TF_ASSERT_OK(writer->WriteSourceFile(source_file)); } else if (index % 3 == 1) { StackFrameWithId* stack_frame = new StackFrameWithId(); stack_frame->set_id(strings::Printf("e%.2d", index)); - writer->WriteStackFrameWithId(stack_frame); + TF_ASSERT_OK(writer->WriteStackFrameWithId(stack_frame)); } else { GraphOpCreation* op_creation = new GraphOpCreation(); op_creation->set_op_type("Log"); op_creation->set_op_name(strings::Printf("Log_%.2d", index)); - writer->WriteGraphOpCreation(op_creation); + TF_ASSERT_OK(writer->WriteGraphOpCreation(op_creation)); } }; for (size_t i = 0; i < kConcurrentWrites; ++i) { @@ -586,7 +586,7 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferNoFlush) { Execution* execution = new Execution(); execution->set_op_type("Log"); execution->add_input_tensor_ids(i); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); } std::vector actuals; @@ -611,7 +611,7 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { Execution* execution = new Execution(); execution->set_op_type("Log"); execution->add_input_tensor_ids(i); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); @@ -637,7 +637,7 @@ TEST_F(DebugEventsWriterTest, WriteExecutionWithCyclicBufferFlush) { Execution* execution = new Execution(); execution->set_op_type("Abs"); execution->add_input_tensor_ids(counter.fetch_add(1)); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); }; for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { thread_pool->Schedule(fn); @@ -682,7 +682,7 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id(strings::Printf("graph_%.2ld", i)); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); } std::vector actuals; @@ -695,6 +695,31 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferNoFlush) { TF_ASSERT_OK(writer->Close()); } +TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithoutPreviousInitCall) { + const size_t kCyclicBufferSize = -1; + DebugEventsWriter* writer = + DebugEventsWriter::GetDebugEventsWriter(dump_root_, kCyclicBufferSize); + // NOTE(cais): `writer->Init()` is not called here before + // WriteGraphExecutionTrace() is called. This test checks that this is okay + // and the `GraphExecutionTrace` gets written correctly even without `Init()` + // being called first. This scenario can happen when a TF Graph with tfdbg + // debug ops are executed on a remote TF server. + + GraphExecutionTrace* trace = new GraphExecutionTrace(); + trace->set_tfdbg_context_id(strings::Printf("graph_0")); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); + TF_ASSERT_OK(writer->FlushExecutionFiles()); + + std::vector actuals; + ReadDebugEventProtos(writer, DebugEventFileType::GRAPH_EXECUTION_TRACES, + &actuals); + EXPECT_EQ(actuals.size(), 1); + EXPECT_EQ(actuals[0].graph_execution_trace().tfdbg_context_id(), "graph_0"); + + // Close the writer so the files can be safely deleted. 
+ TF_ASSERT_OK(writer->Close()); +} + TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { const size_t kCyclicBufferSize = 10; DebugEventsWriter* writer = @@ -706,7 +731,7 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id(strings::Printf("graph_%.2ld", i)); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); @@ -731,7 +756,7 @@ TEST_F(DebugEventsWriterTest, WriteGrahExecutionTraceWithCyclicBufferFlush) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id( strings::Printf("new_graph_%.2ld", counter.fetch_add(1))); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); }; for (size_t i = 0; i < kCyclicBufferSize * 2; ++i) { thread_pool->Schedule(fn); @@ -818,7 +843,7 @@ TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) { Execution* execution = new Execution(); execution->set_op_type("Log"); execution->add_input_tensor_ids(i); - writer->WriteExecution(execution); + TF_ASSERT_OK(writer->WriteExecution(execution)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); @@ -834,7 +859,7 @@ TEST_F(DebugEventsWriterTest, DisableCyclicBufferBehavior) { for (size_t i = 0; i < kNumEvents; ++i) { GraphExecutionTrace* trace = new GraphExecutionTrace(); trace->set_tfdbg_context_id(strings::Printf("graph_%.2ld", i)); - writer->WriteGraphExecutionTrace(trace); + TF_ASSERT_OK(writer->WriteGraphExecutionTrace(trace)); } TF_ASSERT_OK(writer->FlushExecutionFiles()); From 1de39b575611a252531d0238eefb8a394fa96286 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 11:57:52 -0700 Subject: [PATCH 046/412] Implement outside compilation head extraction. PiperOrigin-RevId: 311172756 Change-Id: Id3dbcbd1582a01ec94424dbb8b08bb475466568c --- ...extract_head_tail_outside_compilation.mlir | 83 ++++++-- .../mlir/tensorflow/transforms/passes.h | 2 +- ...u_extract_head_tail_outside_compilation.cc | 194 ++++++++++++++++-- 3 files changed, 247 insertions(+), 32 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index 77ca08c089a..eb67bdcc914 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -1,13 +1,17 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-tpu-extract-head-tail-outside-compilation | FileCheck %s --dump-input-on-failure -// Tests extraction of a single outside compiled cluster with no input or output dependecies. +// Tests extraction of a outside compiled ops at head of TPU computation. 
-// CHECK-LABEL: func @nodep_single_head_outside_compilation -func @nodep_single_head_outside_compilation() -> () { - // CHECK: "tf.A" - // CHECK-NEXT: "tf_device.launch" - "tf_device.launch"() ( { - "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () +func @single_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: tf_device.launch + // CHECK: "tf.A" + // CHECK-NEXT: tf_device.return + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () "tf.B"() : () -> () "tf.C"() : () -> () tf_device.return @@ -15,15 +19,62 @@ func @nodep_single_head_outside_compilation() -> () { return } -// CHECK-LABEL: func @nodep_multiple_head_outside_compilation -func @nodep_multiple_head_outside_compilation() -> () { - // CHECK: "tf.A" - // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf_device.launch" - "tf_device.launch"() ( { - "tf.A"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.B"() {_xla_outside_compilation = "cluster1"} : () -> () - "tf.C"() : () -> () +// CHECK-LABEL: func @multiple_head_outside_compilation +func @multiple_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.D"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.D"(%1) : (tensor) -> () + tf_device.return + }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + return +} + +// CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle +func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { + // CHECK-NOT: tf_device.launch + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + "tf.C"(%1) : (tensor) -> () + tf_device.return + }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () + return +} + +// CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted +func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: tf_device.return %[[D_OUT]] + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) tf_device.return }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () return diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h 
b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index c1d99c2dee3..0b1ff2beebb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -258,7 +258,7 @@ std::unique_ptr> CreateTPUVariableReformattingPass(); // Creates a pass that extracts outside compilation (CPU ops inside TPU cluster) // at head/tail of TPU cluster to run before/after TPU computation. -std::unique_ptr> +std::unique_ptr> CreateTPUExtractHeadTailOutsideCompilationPass(); // Creates a pass that extract outside compilation (CPU ops inside TPU cluster) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index 141feeb6b24..b9e214470cd 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -14,11 +14,23 @@ limitations under the License. ==============================================================================*/ #include +#include +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" namespace mlir { namespace TFTPU { @@ -30,30 +42,182 @@ namespace { constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; -struct TPUExtractHeadTailOutsideCompilation - : public PassWrapper { - void runOnFunction() override; -}; +bool HasOutsideCompilationAttribute(Operation* op) { + return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr; +} -void TPUExtractHeadTailOutsideCompilation::runOnFunction() { - getFunction().walk([&](tf_device::LaunchOp launch) { - Block& launch_block = launch.GetBody(); - for (auto& op : llvm::make_early_inc_range(launch_block.getOperations())) { - // TODO(b/155115766): Handle outputs that should be inputs to TPU - // LaunchOp. - if (auto attr = - op.getAttrOfType(kXlaOutsideCompilationAttr)) { - op.moveBefore(launch); - } else { +// Returns whether all operands of `op` are from values inside the +// `input_value_set`. 
+bool OpContainsOperandsFromSet(Operation* op, + const llvm::SetVector& input_value_set) { + for (auto operand : op->getOperands()) + if (input_value_set.count(operand) == 0) return false; + + return true; +} + +void RecordOutsideCompiledOpsAndUsages( + Operation* op, llvm::SmallSetVector* outside_compiled_ops, + llvm::SetVector* outside_compiled_op_usages) { + if (HasOutsideCompilationAttribute(op) && + OpContainsOperandsFromSet(op, *outside_compiled_op_usages)) { + outside_compiled_ops->insert(op); + outside_compiled_op_usages->insert(op->getResults().begin(), + op->getResults().end()); + } +} + +// Traverses the MLIR graph and returns a set of ops that +// are connected to inputs of TPU computation and outside compiled. +void ExtractOutsideCompiledOpsConnectedToHead( + Value input_value, llvm::SetVector* values_used_in_host_cluster, + llvm::SmallSetVector* outside_compiled_ops) { + llvm::SmallSetVector parent_outside_compiled_ops_at_head; + for (auto& usage : input_value.getUses()) { + auto head_operation = usage.getOwner(); + RecordOutsideCompiledOpsAndUsages(head_operation, + &parent_outside_compiled_ops_at_head, + values_used_in_host_cluster); + } + + // Traverse the graph and find all outside compiled ops connected from + // the `input_value`. + while (!parent_outside_compiled_ops_at_head.empty()) { + llvm::SmallSetVector connected_outside_compiled_ops; + for (auto head_outside_compiled_op : parent_outside_compiled_ops_at_head) { + auto op_results = head_outside_compiled_op->getOpResults(); + for (auto op_result : op_results) { + for (auto& use : op_result.getUses()) { + auto connected_op = use.getOwner(); + RecordOutsideCompiledOpsAndUsages(connected_op, + &connected_outside_compiled_ops, + values_used_in_host_cluster); + } + } + } + + outside_compiled_ops->insert(parent_outside_compiled_ops_at_head.begin(), + parent_outside_compiled_ops_at_head.end()); + std::swap(parent_outside_compiled_ops_at_head, + connected_outside_compiled_ops); + } +} + +// TODO(hongjunchoi): Also handle ops without inputs that are outside +// compiled. +// +// Returns set of ops that are outside compiled and are directly connected +// to inputs to the TPU computation. +llvm::SmallSetVector IdentifyOutsideCompiledOpsAtHead( + tf_device::ClusterOp tpu_cluster) { + llvm::SmallSetVector outside_compiled_at_head_ops; + llvm::SetVector values_used_in_cluster; + auto& cluster_region = tpu_cluster.body(); + getUsedValuesDefinedAbove(cluster_region, cluster_region, + values_used_in_cluster); + + auto input_value_list = llvm::to_vector<8>(values_used_in_cluster); + for (auto input_value : input_value_list) + ExtractOutsideCompiledOpsConnectedToHead( + input_value, &values_used_in_cluster, &outside_compiled_at_head_ops); + return outside_compiled_at_head_ops; +} + +// Returns output values of extracted outside compiled cluster at head that +// are used by the TPU computation. +llvm::SmallVector GetHeadExtractedClusterOutputs( + const llvm::SmallSetVector& head_outside_compiled_ops) { + llvm::SmallVector outputs; + outputs.reserve(head_outside_compiled_ops.size()); + + for (auto op : head_outside_compiled_ops) { + for (Operation* user : op->getUsers()) { + if (!head_outside_compiled_ops.count(user)) { + outputs.append(op->result_begin(), op->result_end()); break; } } + } + + return outputs; +} + +// Creates new tf_device.launch op with outside compiled ops extracted +// from the head of TPU computation. 
+llvm::Optional IsolateHeadExtractedOpsToLaunchOp( + OpBuilder* builder, tf_device::ClusterOp cluster, + const llvm::SmallSetVector& head_outside_compiled_ops) { + if (head_outside_compiled_ops.empty()) + return llvm::Optional(); + + // Create tf_device.launch op to separate all extracted outside compiled ops + // before the tf_device.cluster. + auto output_values = + GetHeadExtractedClusterOutputs(head_outside_compiled_ops); + + llvm::SmallVector output_return_types; + output_return_types.reserve(output_values.size()); + for (auto output : output_values) + output_return_types.emplace_back(output.getType()); + + builder->setInsertionPoint(cluster); + auto host_launch_op = builder->create( + cluster.getLoc(), builder->getStringAttr(""), output_return_types); + + // Replace all usages of outside compiled ops that are used in TPU + // computation with the results of the above created launch op. + for (auto output_and_index : llvm::enumerate(output_values)) { + auto output_index = output_and_index.index(); + auto output = output_and_index.value(); + for (auto& use : output.getUses()) { + if (!head_outside_compiled_ops.count(use.getOwner())) + use.set(host_launch_op.getResult(output_index)); + } + } + + // Create terminator op for the newly created launch op. + host_launch_op.body().push_back(new Block()); + builder->setInsertionPointToEnd(&host_launch_op.GetBody()); + auto terminator = builder->create( + host_launch_op.getLoc(), output_values); + + // Move all outside compile ops from cluster op to launch op. + for (auto outside_compiled_op : head_outside_compiled_ops) + outside_compiled_op->moveBefore(terminator); + + return host_launch_op; +} + +struct TPUExtractHeadTailOutsideCompilation + : public PassWrapper> { + void runOnOperation() override; +}; + +void TPUExtractHeadTailOutsideCompilation::runOnOperation() { + // Get runtime devices information from the closest parent module. + auto module = getOperation(); + mlir::TF::RuntimeDevices devices; + if (failed(tensorflow::GetDevicesFromOp(module, &devices))) + return signalPassFailure(); + + OpBuilder builder(&getContext()); + module.walk([&](tf_device::ClusterOp cluster) { + auto head_outside_compiled_ops = IdentifyOutsideCompiledOpsAtHead(cluster); + IsolateHeadExtractedOpsToLaunchOp(&builder, cluster, + head_outside_compiled_ops); + + // TODO(b/156030523): Update device attribute of newly created host launch + // op as well as enclosing Replicate op (if TPU computation is replicated) + // with host device names. + + // TODO(b/155115766): Implement tail outside compiled op extraction. }); } } // anonymous namespace -std::unique_ptr> +std::unique_ptr> CreateTPUExtractHeadTailOutsideCompilationPass() { return std::make_unique(); } From 9f58e6902cea5d26e68635d1c766c2dc6125577a Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Tue, 12 May 2020 12:26:39 -0700 Subject: [PATCH 047/412] [XLA/GPU] Make Thunk::Initialize() happen at compile-time, not run-time. This simplifies GpuExecutable for MLIR transition. 
PiperOrigin-RevId: 311178815 Change-Id: Ib9c8b8a2f8719c0cd8b342ab07af6e8cb65d82bf --- tensorflow/compiler/xla/service/gpu/BUILD | 12 ++++++----- .../xla/service/gpu/amdgpu_compiler.cc | 10 ++++----- .../xla/service/gpu/amdgpu_compiler.h | 2 +- .../xla/service/gpu/conditional_thunk.cc | 4 ++-- .../xla/service/gpu/conditional_thunk.h | 2 +- .../compiler/xla/service/gpu/for_thunk.cc | 4 ++-- .../compiler/xla/service/gpu/for_thunk.h | 2 +- .../compiler/xla/service/gpu/gpu_compiler.cc | 21 +++++++++++++------ .../compiler/xla/service/gpu/gpu_compiler.h | 7 +++---- .../xla/service/gpu/gpu_executable.cc | 10 ++++----- .../compiler/xla/service/gpu/gpu_executable.h | 18 ++++++---------- .../compiler/xla/service/gpu/gpu_types.h | 17 +++++++++++++++ .../compiler/xla/service/gpu/kernel_thunk.cc | 9 ++++---- .../compiler/xla/service/gpu/kernel_thunk.h | 4 +--- .../xla/service/gpu/nvptx_compiler.cc | 11 ++++------ .../compiler/xla/service/gpu/nvptx_compiler.h | 2 +- .../xla/service/gpu/sequential_thunk.cc | 4 ++-- .../xla/service/gpu/sequential_thunk.h | 2 +- tensorflow/compiler/xla/service/gpu/thunk.h | 5 ++--- .../compiler/xla/service/gpu/while_thunk.cc | 6 +++--- .../compiler/xla/service/gpu/while_thunk.h | 2 +- .../service/mlir_gpu/mlir_compiler_impl.cc | 7 ++++--- .../compiler/xla/tests/llvm_compiler_test.cc | 7 +++---- 23 files changed, 90 insertions(+), 78 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 61bc41283e1..8f8263a85f9 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -17,15 +17,15 @@ load( "tf_cuda_library", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") +load( + "@local_config_rocm//rocm:build_defs.bzl", + "if_rocm", + "if_rocm_is_configured", +) load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) -load( - "@local_config_rocm//rocm:build_defs.bzl", - "if_rocm_is_configured", -) load("//tensorflow:tensorflow.bzl", "if_nccl") package( @@ -86,6 +86,7 @@ cc_library( name = "gpu_types", hdrs = ["gpu_types.h"], deps = [ + "//tensorflow/compiler/xla:types", "@com_google_absl//absl/types:variant", ], ) @@ -405,6 +406,7 @@ cc_library( deps = [ ":buffer_allocations", ":gpu_executable_run_options", + ":gpu_types", ":hlo_execution_profiler", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/service:hlo", diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc index 974db02b1b3..485aff0c4d8 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc @@ -104,11 +104,9 @@ GpuVersion AMDGPUCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return isa_version; } -StatusOr>> -AMDGPUCompiler::CompileTargetBinary(const HloModule* module, - llvm::Module* llvm_module, - GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr AMDGPUCompiler::CompileTargetBinary( + const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { if (rocdl_dir_.empty()) { // Compute rocdl_dir_ just once and cache it in this member. 
rocdl_dir_ = GetROCDLDir(module->config()); @@ -129,7 +127,7 @@ AMDGPUCompiler::CompileTargetBinary(const HloModule* module, user_post_optimization_hook_(*llvm_module); } - return std::pair>("", std::move(hsaco)); + return GpuTargetBinary{"", std::move(hsaco)}; } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h index acc5e021e3d..9033585763b 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h @@ -39,7 +39,7 @@ class AMDGPUCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr>> CompileTargetBinary( + StatusOr CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index e31f45942b1..5e7d89c7aee 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -50,7 +50,7 @@ void ConditionalThunk::ComputeAnnotations() { } } -Status ConditionalThunk::Initialize(const GpuExecutable& executable, +Status ConditionalThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { if (branch_index_is_bool_) { TF_RET_CHECK(branch_thunks_.size() == 2); @@ -58,7 +58,7 @@ Status ConditionalThunk::Initialize(const GpuExecutable& executable, TF_RET_CHECK(!branch_thunks_.empty()); } for (auto& branch_thunk : branch_thunks_) { - TF_RETURN_IF_ERROR(branch_thunk->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(branch_thunk->Initialize(target_binary, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index 404e2131eff..ba69e1a38ec 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -52,7 +52,7 @@ class ConditionalThunk : public Thunk { ConditionalThunk& operator=(const ConditionalThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index 0a97f668b38..aacc9deb739 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -39,9 +39,9 @@ void ForThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status ForThunk::Initialize(const GpuExecutable& executable, +Status ForThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 57402f70627..57657b6825f 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -38,7 +38,7 @@ class ForThunk : public Thunk { ForThunk& operator=(const ForThunk&) = delete; 
void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 5f6dfd7d3a5..533ff52a90d 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -565,8 +565,7 @@ StatusOr> GpuCompiler::RunBackend( GpuVersion gpu_version = GetGpuVersion(stream_exec); - using BackendCompileResult = std::pair>; - TF_ASSIGN_OR_RETURN(BackendCompileResult backend_result, + TF_ASSIGN_OR_RETURN(GpuTargetBinary backend_result, CompileTargetBinary(module.get(), &llvm_module, gpu_version, stream_exec)); @@ -578,6 +577,11 @@ StatusOr> GpuCompiler::RunBackend( thunk_schedule->ToString()); } + std::vector thunks; + for (Thunk* thunk : thunk_schedule->TotalOrder()) { + thunks.push_back(thunk); + } + std::unique_ptr profile_index_map; std::unique_ptr profile_printer; @@ -597,14 +601,19 @@ StatusOr> GpuCompiler::RunBackend( } auto* gpu_executable = new GpuExecutable( - backend_result.first, backend_result.second, gpu_version, - std::move(thunk_schedule), std::move(module), - std::move(buffer_assignment), std::move(profile_printer), - std::move(profile_index_map)); + std::move(backend_result), gpu_version, std::move(thunk_schedule), + std::move(module), std::move(buffer_assignment), + std::move(profile_printer), std::move(profile_index_map)); if (embed_ir_in_executable) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); } + + for (Thunk* thunk : thunks) { + TF_RETURN_IF_ERROR( + thunk->Initialize(gpu_executable->target_binary(), stream_exec)); + } + return std::unique_ptr(gpu_executable); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index b52af5392d1..deb5d785777 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -74,10 +74,9 @@ class GpuCompiler : public LLVMCompiler { virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0; - virtual StatusOr>> - CompileTargetBinary(const HloModule* hlo_module, llvm::Module* llvm_module, - GpuVersion gpu_version, - se::StreamExecutor* stream_exec) = 0; + virtual StatusOr CompileTargetBinary( + const HloModule* hlo_module, llvm::Module* llvm_module, + GpuVersion gpu_version, se::StreamExecutor* stream_exec) = 0; Status PrepareHloModuleForIrEmitting(HloModule* hlo_module); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 2df6b50d361..ebd3630635b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -52,16 +52,15 @@ using ::tensorflow::profiler::ScopedAnnotation; // Implementation note: HLO profiling is always enabled for GPU executables, // since we can use timers around thunks. 
GpuExecutable::GpuExecutable( - const string& text, const std::vector& binary, - GpuVersion gpu_version, std::unique_ptr thunk_schedule, + GpuTargetBinary target_binary, GpuVersion gpu_version, + std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)), - text_(text), - binary_(binary), + target_binary_(std::move(target_binary)), gpu_version_(gpu_version), thunk_schedule_(std::move(thunk_schedule)), assignment_(std::move(assignment)) { @@ -176,7 +175,6 @@ Status GpuExecutable::ExecuteThunks( // module, we won't get any data, but that's probably an OK trade-off. ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); - TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); se::Stream* stream = @@ -469,7 +467,7 @@ const InstructionValueSet& GpuExecutable::GetRootValueSet() const { int64 GpuExecutable::SizeOfGeneratedCodeInBytes() { // Non-empty PTX but empty cubin: compilation must have failed, return // "unknown". - if (binary().empty() && !text_.empty()) { + if (binary().empty() && !text().empty()) { return -1; } return binary().size(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 045a36c099b..29441c60b04 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -52,8 +52,7 @@ class GpuExecutable : public Executable { // We need to share ownership of hlo_module and assignment with profiler to // safely keep a reference to these objects during tracing period, thus they // are passed as shared pointers. - GpuExecutable(const string& text, const std::vector& binary, - GpuVersion gpu_version, + GpuExecutable(GpuTargetBinary target_binary, GpuVersion gpu_version, std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, @@ -73,12 +72,14 @@ class GpuExecutable : public Executable { // Returns the compiled code for the computation. The compiled code is PTX in // Cuda and unused empty string in ROCm. - const string& text() const { return text_; } + const string& text() const { return target_binary_.text; } // Returns the binary stored in this GpuExecutable. The binary is cubin in // Cuda, and HSA code object in ROCm. It may be empty, in which case // compilation is left up to the GPU driver. - const std::vector& binary() const { return binary_; } + const std::vector& binary() const { return target_binary_.binary; } + + const GpuTargetBinary& target_binary() const { return target_binary_; } // ExecuteAsyncOnStream will fail if the compute capability of the stream // doesn't match the compute capability passed to this object's constructor. @@ -131,14 +132,7 @@ class GpuExecutable : public Executable { // This string should be modified only before ExecuteOnStream. string ir_module_string_; - // The compiled code for the computation. - const string text_; - - // The GPU machine code for the computation, targeting GPUs at - // compute_capability_. - // - // May be empty, in which case we leave compilation up to the GPU driver. - const std::vector binary_; + const GpuTargetBinary target_binary_; // The GPU version for compute compatibility check. 
GpuVersion gpu_version_; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_types.h b/tensorflow/compiler/xla/service/gpu/gpu_types.h index 1c51040fb82..5c8b8093d65 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_types.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_types.h @@ -16,7 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ +#include +#include + #include "absl/types/variant.h" +#include "tensorflow/compiler/xla/types.h" namespace xla { namespace gpu { @@ -25,6 +29,19 @@ namespace gpu { // it comprises a pair of integers denoting major and minor version. // On ROCm platform, it comprises one integer for AMD GCN ISA version. using GpuVersion = absl::variant, int>; + +// A struct to carry around compiled results by the GPU assembler. +struct GpuTargetBinary { + GpuTargetBinary(const GpuTargetBinary& other) = delete; + GpuTargetBinary(GpuTargetBinary&& other) = default; + + // The text format of the compiled result, e.g. PTX. + std::string text; + + // The actual compiled binary. + std::vector binary; +}; + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index d976b5d8d4d..0b5010ea66b 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -18,7 +18,6 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -42,7 +41,7 @@ KernelThunk::KernelThunk(absl::Span args, kernel_name_(kernel_name), unroll_factor_(unroll_factor) {} -Status KernelThunk::Initialize(const GpuExecutable& executable, +Status KernelThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { tensorflow::mutex_lock lock(mutex_); @@ -55,8 +54,10 @@ Status KernelThunk::Initialize(const GpuExecutable& executable, if (kernel_cache_.end() == it) { TF_ASSIGN_OR_RETURN( std::unique_ptr kernel, - CreateKernel(kernel_name_, args_.size(), executable.text(), - executable.binary(), executor)); + CreateKernel(kernel_name_, args_.size(), target_binary.text, + target_binary.binary, executor)); + CHECK(!target_binary.binary.empty()); + CHECK(kernel); kernel_cache_.emplace(executor, std::move(kernel)); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 88351881f3a..97a1d08a57e 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -35,8 +35,6 @@ limitations under the License. namespace xla { namespace gpu { -class GpuExecutable; - // This class stores everything that StreamExecutor needs for launching a // kernel. It implements the ExecuteOnStream interface for GpuExecutable to // invoke the corresponding kernel. 
@@ -58,7 +56,7 @@ class KernelThunk : public Thunk { int unroll_factor() const { return unroll_factor_; } void SetLaunchDimensions(const LaunchDimensions& launch_dims); - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 0196267d904..cf6fe9292e5 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -295,11 +295,9 @@ GpuVersion NVPTXCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return std::make_pair(cc_major, cc_minor); } -StatusOr>> -NVPTXCompiler::CompileTargetBinary(const HloModule* module, - llvm::Module* llvm_module, - GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr NVPTXCompiler::CompileTargetBinary( + const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { std::pair compute_capability = absl::get>(gpu_version); @@ -340,8 +338,7 @@ NVPTXCompiler::CompileTargetBinary(const HloModule* module, stream_exec, ptx, compute_capability.first, compute_capability.second, module->config()); - return std::pair>(std::move(ptx), - std::move(cubin)); + return GpuTargetBinary{std::move(ptx), std::move(cubin)}; } std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index e69be947522..ec550b5b2ff 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -48,7 +48,7 @@ class NVPTXCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr>> CompileTargetBinary( + StatusOr CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index 025ca60ef0c..bd260336c28 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -34,10 +34,10 @@ void SequentialThunk::ComputeAnnotations() { } } -Status SequentialThunk::Initialize(const GpuExecutable& executable, +Status SequentialThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { for (auto& thunk : thunks_) { - TF_RETURN_IF_ERROR(thunk->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(thunk->Initialize(target_binary, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index 3abb82c0b66..b5475664733 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -40,7 +40,7 @@ class SequentialThunk : public Thunk { const std::vector>& thunks() const { return thunks_; } void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h 
b/tensorflow/compiler/xla/service/gpu/thunk.h index e9be41b74de..7aff9ca47b7 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/lib/core/status.h" @@ -30,8 +31,6 @@ limitations under the License. namespace xla { namespace gpu { -class GpuExecutable; - // Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the // metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction. // @@ -97,7 +96,7 @@ class Thunk { // This may be called multiple times. Its main purpose is to give us a chance // to do initialization outside of ExecuteOnStream() so that the // time spent initializing doesn't count towards our execution profile. - virtual Status Initialize(const GpuExecutable& /*executable*/, + virtual Status Initialize(const GpuTargetBinary& /*target_binary*/, se::StreamExecutor* /*executor*/) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 4134cd39832..2650508093e 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -45,11 +45,11 @@ void WhileThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status WhileThunk::Initialize(const GpuExecutable& executable, +Status WhileThunk::Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR( - condition_thunk_sequence_->Initialize(executable, executor)); - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); + condition_thunk_sequence_->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 31db01b72ba..77ee0104a1f 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -47,7 +47,7 @@ class WhileThunk : public Thunk { WhileThunk& operator=(const WhileThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuExecutable& executable, + Status Initialize(const GpuTargetBinary& target_binary, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index 35ac3b2bf63..667cdef8f6c 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -549,10 +549,11 @@ StatusOr> MlirCompilerImpl::RunBackend( } // TODO(b/137624192): Add profiling support. 
+ return {absl::make_unique( - ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), - emission_context.releaseHloModule(), std::move(buffer_assignment), - nullptr, nullptr)}; + xla::gpu::GpuTargetBinary{ptx, cubin}, GetGpuVersion(stream_exec), + std::move(thunk_schedule), emission_context.releaseHloModule(), + std::move(buffer_assignment), nullptr, nullptr)}; } StatusOr>> MlirCompilerImpl::Compile( diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index 1947f517bd9..16ed02296b7 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -55,16 +55,15 @@ class GpuDummyCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) { return 0; } - StatusOr>> CompileTargetBinary( + StatusOr CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, - GpuVersion gpu_version, se::StreamExecutor* stream_exec) { + GpuVersion gpu_version, se::StreamExecutor* stream_exec) override { if (user_post_optimization_hook_) { user_post_optimization_hook_(*llvm_module); } std::vector compiled_results; - return std::pair>( - "", std::move(compiled_results)); + return GpuTargetBinary{"", std::move(compiled_results)}; } }; } // namespace gpu From 88dfd8ce6dc063659e4fb9b8a6a040b8a673c466 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Tue, 12 May 2020 12:28:52 -0700 Subject: [PATCH 048/412] Update hexagon_delegate guide to reference delegate readme PiperOrigin-RevId: 311179262 Change-Id: Ic3dd3851facd12c8e1e8adde3f9f60a31355e430 --- .../g3doc/performance/hexagon_delegate.md | 38 +------------------ 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/hexagon_delegate.md b/tensorflow/lite/g3doc/performance/hexagon_delegate.md index 51af59891dc..60fe9465bf4 100644 --- a/tensorflow/lite/g3doc/performance/hexagon_delegate.md +++ b/tensorflow/lite/g3doc/performance/hexagon_delegate.md @@ -259,43 +259,7 @@ ro.board.platform`). * This is tentatively planned for a future release, though there is no concrete timeline. * Which ops are supported by the delegate? - * Initial list of supported ops: - * Add - * ArgMax - * ArgMin - * AveragePool2D (without any activation) - * Concat - * Conv2D with following constraints: - * stride width/height <= 3 - * DepthToSpace - * DepthwiseConv2D with following constraints: - * Filter width == 3 - * depth_multiplier == 1 - * dilation only supported when stride == 1 - * Otherwise, stride height/width <= 3 - * FullyConnected (without any activation) - * Hardswish - * L2Normalization (without any activation) - * Logistic (aka Sigmoid) - * MaxPool2D (without any activation) - * Mul (without any activation) - * Neg - * Pad: Only supports 0 padding - * Relu - * Relu6 - * Reshape - * Resize Bilinear with following constraints: - * Requested size <= 65 - * Resize Nearest Neighbor - * SoftMax - * SpaceToDepth - * Split - * Sub - * Tanh - * Transpose - * TransposeConv2D with following constraints: - * stride height/width <= 3 - * dilation height/width == 1 + * See the current list of [supported ops and constraints](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/delegates/hexagon/README.md) * How can I tell that the model is using the DSP when I enable the delegate? 
* Two log messages will be printed when you enable the delegate - one to indicate if the delegate was created and another to indicate how many From 5100abc4af068b6915a26b9f6531b9fec4da4c06 Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Tue, 12 May 2020 12:29:59 -0700 Subject: [PATCH 049/412] Initial checkin of C++ header-only TensorHandle as part of RFC https://github.com/tensorflow/community/pull/207. PiperOrigin-RevId: 311179503 Change-Id: Ib3cfb2547150d09ee655db6ca6bc72ef3ef7adde --- tensorflow/c/eager/c_api.cc | 2 +- tensorflow/c/eager/c_api.h | 2 +- tensorflow/cc/experimental/base/public/BUILD | 14 ++ .../cc/experimental/base/public/runtime.h | 3 + .../base/public/runtime_builder.h | 2 + .../cc/experimental/base/public/status.h | 3 + .../cc/experimental/base/public/tensor.h | 2 + .../experimental/base/public/tensorhandle.h | 98 ++++++++++ tensorflow/cc/experimental/base/tests/BUILD | 29 +++ .../cc/experimental/base/tests/tensor_test.cc | 101 +++------- .../base/tests/tensor_types_test_util.h | 76 ++++++++ .../base/tests/tensorhandle_test.cc | 184 ++++++++++++++++++ .../experimental/public/concrete_function.h | 2 + .../public/concrete_function_list.h | 2 + .../experimental/public/function_metadata.h | 2 + .../experimental/public/saved_model_api.h | 2 + .../tests/saved_model_api_test.cc | 27 +-- 17 files changed, 462 insertions(+), 89 deletions(-) create mode 100644 tensorflow/cc/experimental/base/public/tensorhandle.h create mode 100644 tensorflow/cc/experimental/base/tests/tensor_types_test_util.h create mode 100644 tensorflow/cc/experimental/base/tests/tensorhandle_test.cc diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 73c2f7824b2..5c01ccb82bb 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -924,7 +924,7 @@ extern TFE_ContextDevicePlacementPolicy TFE_ContextGetDevicePlacementPolicy( context->GetDevicePlacementPolicy()); } -TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, TF_Status* status) { +TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status) { tensorflow::Tensor tensor; status->status = tensorflow::TF_TensorToTensor(t, &tensor); if (!status->status.ok()) return nullptr; diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index 070b3a9bb60..5afe3047dd7 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -137,7 +137,7 @@ TF_CAPI_EXPORT extern void TFE_ContextSetServerDef(TFE_Context* ctx, // placed in memory of different devices or remote address spaces. typedef struct TFE_TensorHandle TFE_TensorHandle; -TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(TF_Tensor* t, +TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandle(const TF_Tensor* t, TF_Status* status); // Indicates that the caller will not be using `h` any more. 
TF_CAPI_EXPORT extern void TFE_DeleteTensorHandle(TFE_TensorHandle* h); diff --git a/tensorflow/cc/experimental/base/public/BUILD b/tensorflow/cc/experimental/base/public/BUILD index 93acf1bd319..045d4e6cd97 100644 --- a/tensorflow/cc/experimental/base/public/BUILD +++ b/tensorflow/cc/experimental/base/public/BUILD @@ -62,3 +62,17 @@ cc_library( "//tensorflow/c:tf_tensor", ], ) + +cc_library( + name = "tensorhandle", + hdrs = [ + "tensorhandle.h", + ], + deps = [ + ":runtime", + ":status", + ":tensor", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:c_api_experimental", + ], +) diff --git a/tensorflow/cc/experimental/base/public/runtime.h b/tensorflow/cc/experimental/base/public/runtime.h index 47fd8869647..711a38c233a 100644 --- a/tensorflow/cc/experimental/base/public/runtime.h +++ b/tensorflow/cc/experimental/base/public/runtime.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/eager/c_api_experimental.h" namespace tensorflow { +namespace experimental { namespace cc { // Runtime represents an opaque instance of a Tensorflow runtime, with its own @@ -40,6 +41,7 @@ class Runtime { private: friend class RuntimeBuilder; friend class SavedModelAPI; + friend class TensorHandle; // Wraps a TFE_Context. Takes ownership of ctx. explicit Runtime(TFE_Context* ctx) : ctx_(ctx) {} @@ -63,6 +65,7 @@ class Runtime { }; } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_H_ diff --git a/tensorflow/cc/experimental/base/public/runtime_builder.h b/tensorflow/cc/experimental/base/public/runtime_builder.h index ed3c93ae135..737e06cb2c6 100644 --- a/tensorflow/cc/experimental/base/public/runtime_builder.h +++ b/tensorflow/cc/experimental/base/public/runtime_builder.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/cc/experimental/base/public/status.h" namespace tensorflow { +namespace experimental { namespace cc { // RuntimeBuilder is a builder used to construct a tensorflow::cc::Runtime. @@ -79,6 +80,7 @@ inline std::unique_ptr RuntimeBuilder::Build(Status* status) { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_RUNTIME_BUILDER_H_ diff --git a/tensorflow/cc/experimental/base/public/status.h b/tensorflow/cc/experimental/base/public/status.h index f91f2caccd8..98c8cf6ced2 100644 --- a/tensorflow/cc/experimental/base/public/status.h +++ b/tensorflow/cc/experimental/base/public/status.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/c/tf_status.h" namespace tensorflow { +namespace experimental { namespace cc { // Status is a wrapper around an error code and an optional error message. @@ -57,6 +58,7 @@ class Status { friend class RuntimeBuilder; friend class Runtime; friend class SavedModelAPI; + friend class TensorHandle; // Wraps a TF_Status*, and takes ownership of it. explicit Status(TF_Status* status) : status_(status) {} @@ -88,6 +90,7 @@ inline void Status::SetStatus(TF_Code code, const std::string& msg) { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_STATUS_H_ diff --git a/tensorflow/cc/experimental/base/public/tensor.h b/tensorflow/cc/experimental/base/public/tensor.h index 26b0e5dc55e..fc447262ce1 100644 --- a/tensorflow/cc/experimental/base/public/tensor.h +++ b/tensorflow/cc/experimental/base/public/tensor.h @@ -28,6 +28,7 @@ limitations under the License. 
#include "tensorflow/cc/experimental/base/public/status.h" namespace tensorflow { +namespace experimental { namespace cc { // Tensor represents an n-dimensional array of values. @@ -168,6 +169,7 @@ inline Tensor Tensor::FromBuffer(TF_DataType dtype, } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSOR_H_ diff --git a/tensorflow/cc/experimental/base/public/tensorhandle.h b/tensorflow/cc/experimental/base/public/tensorhandle.h new file mode 100644 index 00000000000..99453ee7ea8 --- /dev/null +++ b/tensorflow/cc/experimental/base/public/tensorhandle.h @@ -0,0 +1,98 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ + +#include +#include + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/c/eager/c_api_experimental.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/status.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" + +namespace tensorflow { +namespace experimental { +namespace cc { + +// An opaque representation of a tensor computed/managed by the Tensorflow +// runtime (tensorflow:cc::Runtime). Unlike a tensor, a Tensorhandle may refer +// to tensors placed in memory of different devices or remote address spaces. +// Note that tensorflow::cc::Runtime MUST outlive all TensorHandles created +// from it. +class TensorHandle { + public: + // Unwraps a Tensor from the given TensorHandle. If an error occurred, + // status->ok() will be false, and the returned Tensor must not be used. + Tensor Resolve(Status* status); + + // Constructs a TensorHandle from a Tensor. If an error occurred, + // status->ok() will be false, and the returned TensorHandle must not be used. + static TensorHandle FromTensor(const Tensor& tensor, const Runtime& runtime, + Status* status); + + // TensorHandle is movable, and not copyable + TensorHandle(TensorHandle&&) = default; + TensorHandle& operator=(TensorHandle&&) = default; + + private: + // Wraps a TFE_TensorHandle. Takes ownership of handle. + explicit TensorHandle(TFE_TensorHandle* handle) : handle_(handle) {} + + // TensorHandle is not copyable + TensorHandle(const TensorHandle&) = delete; + TensorHandle& operator=(const TensorHandle&) = delete; + + // Returns the underlying TFE_TensorHandle that this object wraps. + // This object retains ownership of the pointer. + TFE_TensorHandle* GetTFETensorHandle() const { return handle_.get(); } + + // Deletes the currently wrapped TFE_TensorHandle, and swaps it with handle, + // and takes ownership of handle. 
+ void Reset(TFE_TensorHandle* handle) { handle_.reset(handle); } + + struct TFETensorHandleDeleter { + void operator()(TFE_TensorHandle* p) const { TFE_DeleteTensorHandle(p); } + }; + std::unique_ptr handle_; +}; + +inline Tensor TensorHandle::Resolve(Status* status) { + TF_Tensor* tensor = + TFE_TensorHandleResolve(handle_.get(), status->GetTFStatus()); + if (!status->ok()) { + return Tensor(nullptr); + } + return Tensor(tensor); +} + +inline TensorHandle TensorHandle::FromTensor(const Tensor& tensor, + const Runtime& runtime, + Status* status) { + TFE_TensorHandle* tensor_handle = TFE_NewTensorHandleFromTensor( + runtime.GetTFEContext(), tensor.GetTFTensor(), status->GetTFStatus()); + if (!status->ok()) { + return TensorHandle(nullptr); + } + return TensorHandle(tensor_handle); +} + +} // namespace cc +} // namespace experimental +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_PUBLIC_TENSORHANDLE_H_ diff --git a/tensorflow/cc/experimental/base/tests/BUILD b/tensorflow/cc/experimental/base/tests/BUILD index a2b634a70f4..f449d618f72 100644 --- a/tensorflow/cc/experimental/base/tests/BUILD +++ b/tensorflow/cc/experimental/base/tests/BUILD @@ -5,12 +5,22 @@ package( licenses = ["notice"], # Apache 2.0 ) +cc_library( + name = "tensor_types_test_util", + testonly = True, + hdrs = ["tensor_types_test_util.h"], + deps = [ + "//tensorflow/c:tf_datatype", + ], +) + tf_cc_test( name = "tensor_test", srcs = [ "tensor_test.cc", ], deps = [ + ":tensor_types_test_util", "//tensorflow/c:tf_datatype", "//tensorflow/cc/experimental/base/public:status", "//tensorflow/cc/experimental/base/public:tensor", @@ -19,3 +29,22 @@ tf_cc_test( "//tensorflow/core:test_main", ], ) + +tf_cc_test( + name = "tensorhandle_test", + srcs = [ + "tensorhandle_test.cc", + ], + deps = [ + ":tensor_types_test_util", + "//tensorflow/c:tf_datatype", + "//tensorflow/cc/experimental/base/public:runtime", + "//tensorflow/cc/experimental/base/public:runtime_builder", + "//tensorflow/cc/experimental/base/public:status", + "//tensorflow/cc/experimental/base/public:tensor", + "//tensorflow/cc/experimental/base/public:tensorhandle", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/cc/experimental/base/tests/tensor_test.cc b/tensorflow/cc/experimental/base/tests/tensor_test.cc index 86a50bac5cd..33f9ab637e8 100644 --- a/tensorflow/cc/experimental/base/tests/tensor_test.cc +++ b/tensorflow/cc/experimental/base/tests/tensor_test.cc @@ -16,69 +16,22 @@ limitations under the License. #include "tensorflow/cc/experimental/base/public/tensor.h" #include - -#include +#include #include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" -namespace tensorflow { namespace { -// Each of the following struct types have two members: a kDType that -// corresponds to a TF_Datatype enum value, and a typedef "type" -// of its corresponding C++ type. 
These types allow us to write Dtype-agnostic -// tests via GoogleTest's TypedTests: -// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests -struct FloatType { - using type = float; - static constexpr TF_DataType kDType = TF_FLOAT; -}; +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; -struct DoubleType { - using type = double; - static constexpr TF_DataType kDType = TF_DOUBLE; -}; - -struct Int32Type { - using type = int32_t; - static constexpr TF_DataType kDType = TF_INT32; -}; - -struct UINT8Type { - using type = uint8_t; - static constexpr TF_DataType kDType = TF_UINT8; -}; - -struct INT8Type { - using type = int8_t; - static constexpr TF_DataType kDType = TF_INT8; -}; - -struct INT64Type { - using type = int64_t; - static constexpr TF_DataType kDType = TF_INT64; -}; - -struct UINT16Type { - using type = uint16_t; - static constexpr TF_DataType kDType = TF_UINT16; -}; - -struct UINT32Type { - using type = uint32_t; - static constexpr TF_DataType kDType = TF_UINT32; -}; - -struct UINT64Type { - using type = uint64_t; - static constexpr TF_DataType kDType = TF_UINT64; -}; - -using SimpleTypes = - ::testing::Types; +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; template class ConstructScalarTensorTest : public ::testing::Test {}; @@ -88,14 +41,13 @@ TYPED_TEST_SUITE(ConstructScalarTensorTest, SimpleTypes); // and verifies the expected dimensions, dtype, value, number of bytes, and // number of elements. TYPED_TEST(ConstructScalarTensorTest, ValidTensorAttributesAfterConstruction) { - cc::Status status; + Status status; TF_DataType dtype = TypeParam::kDType; typename TypeParam::type value = 42; - cc::Tensor tensor = - cc::Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, - /*data=*/&value, - /*len=*/sizeof(value), - /*deleter=*/[](void*, size_t) {}, &status); + Tensor tensor = Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); ASSERT_TRUE(status.ok()) << status.message(); EXPECT_EQ(tensor.dims(), 0); @@ -113,7 +65,7 @@ TYPED_TEST_SUITE(Construct1DTensorTest, SimpleTypes); // and verifies the expected dimensions, dtype, value, number of bytes, and // number of elements. TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { - cc::Status status; + Status status; TF_DataType dtype = TypeParam::kDType; // This is our 1D tensor of varying dtype. 
std::vector value = {42, 100, 0, 1, 4, 29}; @@ -121,7 +73,7 @@ TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { std::vector shape; shape.push_back(value.size()); - cc::Tensor tensor = cc::Tensor::FromBuffer( + Tensor tensor = Tensor::FromBuffer( /*dtype=*/dtype, /*shape=*/shape, /*data=*/value.data(), /*len=*/value.size() * sizeof(typename TypeParam::type), @@ -130,7 +82,7 @@ TYPED_TEST(Construct1DTensorTest, ValidTensorAttributesAfterConstruction) { EXPECT_EQ(tensor.dims(), 1); EXPECT_EQ(tensor.dtype(), dtype); - gtl::ArraySlice tensor_view( + tensorflow::gtl::ArraySlice tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); @@ -152,14 +104,14 @@ TYPED_TEST_SUITE(Construct2DTensorTest, SimpleTypes); // and verifies the expected dimensions, dtype, value, number of bytes, and // number of elements. TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { - cc::Status status; + Status status; TF_DataType dtype = TypeParam::kDType; // This is our 1D tensor of varying dtype. std::vector value = {42, 100, 0, 1, 4, 29}; // Shape is Rank 2 vector with shape 2 x 3. std::vector shape({2, 3}); - cc::Tensor tensor = cc::Tensor::FromBuffer( + Tensor tensor = Tensor::FromBuffer( /*dtype=*/dtype, /*shape=*/shape, /*data=*/value.data(), /*len=*/value.size() * sizeof(typename TypeParam::type), @@ -169,7 +121,7 @@ TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { EXPECT_EQ(tensor.dims(), 2); EXPECT_EQ(tensor.dtype(), dtype); - gtl::ArraySlice tensor_view( + tensorflow::gtl::ArraySlice tensor_view( reinterpret_cast(tensor.data()), value.size()); EXPECT_EQ(tensor_view[0], 42); EXPECT_EQ(tensor_view[1], 100); @@ -185,22 +137,22 @@ TYPED_TEST(Construct2DTensorTest, ValidTensorAttributesAfterConstruction) { TEST(CPPTensorAPI, ConstructTensorFromBuffer) { bool done = false; - cc::Status status; + Status status; std::vector data_vector({12, 14, 20, 18, 39, 42, 100}); { // data_vector is a rank 1 tensor. std::vector shape; shape.push_back(data_vector.size()); - cc::Tensor::DeleterCallback callback = [&done](void* data, size_t len) { + Tensor::DeleterCallback callback = [&done](void* data, size_t len) { done = true; }; - cc::Tensor tensor = - cc::Tensor::FromBuffer(/*dtype=*/TF_INT32, /*shape=*/shape, - /*data=*/data_vector.data(), - /*len=*/data_vector.size() * sizeof(int32_t), - /*deleter=*/callback, &status); + Tensor tensor = + Tensor::FromBuffer(/*dtype=*/TF_INT32, /*shape=*/shape, + /*data=*/data_vector.data(), + /*len=*/data_vector.size() * sizeof(int32_t), + /*deleter=*/callback, &status); ASSERT_TRUE(status.ok()) << status.message(); } // At this point, tensor has been destroyed, and the deleter callback should @@ -209,4 +161,3 @@ TEST(CPPTensorAPI, ConstructTensorFromBuffer) { } } // namespace -} // namespace tensorflow diff --git a/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h new file mode 100644 index 00000000000..af9cad7529b --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensor_types_test_util.h @@ -0,0 +1,76 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ +#define TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ + +#include + +#include "tensorflow/c/tf_datatype.h" + +namespace tensorflow { + +// Each of the following struct types have two members: a kDType that +// corresponds to a TF_Datatype enum value, and a typedef "type" +// of its corresponding C++ type. These types allow us to write Dtype-agnostic +// tests via GoogleTest's TypedTests: +// https://github.com/google/googletest/blob/e589a337170554c48bc658cc857cf15080c9eacc/googletest/docs/advanced.md#typed-tests +struct FloatType { + using type = float; + static constexpr TF_DataType kDType = TF_FLOAT; +}; + +struct DoubleType { + using type = double; + static constexpr TF_DataType kDType = TF_DOUBLE; +}; + +struct Int32Type { + using type = int32_t; + static constexpr TF_DataType kDType = TF_INT32; +}; + +struct UINT8Type { + using type = uint8_t; + static constexpr TF_DataType kDType = TF_UINT8; +}; + +struct INT8Type { + using type = int8_t; + static constexpr TF_DataType kDType = TF_INT8; +}; + +struct INT64Type { + using type = int64_t; + static constexpr TF_DataType kDType = TF_INT64; +}; + +struct UINT16Type { + using type = uint16_t; + static constexpr TF_DataType kDType = TF_UINT16; +}; + +struct UINT32Type { + using type = uint32_t; + static constexpr TF_DataType kDType = TF_UINT32; +}; + +struct UINT64Type { + using type = uint64_t; + static constexpr TF_DataType kDType = TF_UINT64; +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_CC_EXPERIMENTAL_BASE_TEST_TENSOR_TYPES_TEST_UTIL_H_ diff --git a/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc new file mode 100644 index 00000000000..cfeaba4e392 --- /dev/null +++ b/tensorflow/cc/experimental/base/tests/tensorhandle_test.cc @@ -0,0 +1,184 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/cc/experimental/base/public/tensorhandle.h" + +#include +#include + +#include + +#include "tensorflow/c/tf_datatype.h" +#include "tensorflow/cc/experimental/base/public/runtime.h" +#include "tensorflow/cc/experimental/base/public/runtime_builder.h" +#include "tensorflow/cc/experimental/base/public/tensor.h" +#include "tensorflow/cc/experimental/base/tests/tensor_types_test_util.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::Status; +using tensorflow::experimental::cc::Tensor; +using tensorflow::experimental::cc::TensorHandle; + +using SimpleTypes = ::testing::Types< + tensorflow::FloatType, tensorflow::DoubleType, tensorflow::Int32Type, + tensorflow::UINT8Type, tensorflow::INT8Type, tensorflow::INT64Type, + tensorflow::UINT16Type, tensorflow::UINT32Type, tensorflow::UINT64Type>; + +template +class ConstructScalarTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(ConstructScalarTensorHandleTest, SimpleTypes); + +// This test constructs a scalar tensor for each of the types in "SimpleTypes", +// then wraps it in a TensorHandle. We then unwrap it back into a Tensor, and +// verify the expected dims, dtype, value, num bytes, and num elements. +TYPED_TEST(ConstructScalarTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + typename TypeParam::type value = 42; + Tensor original_tensor = + Tensor::FromBuffer(/*dtype=*/dtype, /*shape=*/{}, + /*data=*/&value, + /*len=*/sizeof(value), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 0); + EXPECT_EQ(tensor.dtype(), dtype); + EXPECT_EQ(*reinterpret_cast(tensor.data()), 42); + EXPECT_EQ(tensor.num_bytes(), sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), 1); +} + +template +class Construct1DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct1DTensorHandleTest, SimpleTypes); + +// This test constructs a 1D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct1DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 1 vector. 
+ std::vector shape; + shape.push_back(value.size()); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 1); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +template +class Construct2DTensorHandleTest : public ::testing::Test {}; +TYPED_TEST_SUITE(Construct2DTensorHandleTest, SimpleTypes); + +// This test constructs a 2D tensor for each of the types in "SimpleTypes", +// and verifies the expected dimensions, dtype, value, number of bytes, and +// number of elements. +TYPED_TEST(Construct2DTensorHandleTest, + ValidTensorAttributesAfterConstruction) { + Status status; + RuntimeBuilder runtime_builder; + std::unique_ptr runtime = runtime_builder.Build(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + TF_DataType dtype = TypeParam::kDType; + // This is our 1D tensor of varying dtype. + std::vector value = {42, 100, 0, 1, 4, 29}; + // Shape is Rank 2 vector with shape 2 x 3. + std::vector shape({2, 3}); + + Tensor original_tensor = Tensor::FromBuffer( + /*dtype=*/dtype, /*shape=*/shape, + /*data=*/value.data(), + /*len=*/value.size() * sizeof(typename TypeParam::type), + /*deleter=*/[](void*, size_t) {}, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + TensorHandle handle = + TensorHandle::FromTensor(original_tensor, *runtime, &status); + ASSERT_TRUE(status.ok()) << status.message(); + + Tensor tensor = handle.Resolve(&status); + ASSERT_TRUE(status.ok()) << status.message(); + + EXPECT_EQ(tensor.dims(), 2); + EXPECT_EQ(tensor.dtype(), dtype); + tensorflow::gtl::ArraySlice tensor_view( + reinterpret_cast(tensor.data()), value.size()); + EXPECT_EQ(tensor_view[0], 42); + EXPECT_EQ(tensor_view[1], 100); + EXPECT_EQ(tensor_view[2], 0); + EXPECT_EQ(tensor_view[3], 1); + EXPECT_EQ(tensor_view[4], 4); + EXPECT_EQ(tensor_view[5], 29); + + EXPECT_EQ(tensor.num_bytes(), + value.size() * sizeof(typename TypeParam::type)); + EXPECT_EQ(tensor.num_elements(), value.size()); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function.h b/tensorflow/cc/saved_model/experimental/public/concrete_function.h index f57ba052f1a..1adaf70b01a 100644 --- a/tensorflow/cc/saved_model/experimental/public/concrete_function.h +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/function_metadata.h" namespace tensorflow { +namespace experimental { namespace cc { // ConcreteFunction is an executable "function" loaded from a SavedModelAPI. 
@@ -54,6 +55,7 @@ inline const FunctionMetadata* ConcreteFunction::GetFunctionMetadata() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h index bab95278eac..88cb779ef15 100644 --- a/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h +++ b/tensorflow/cc/saved_model/experimental/public/concrete_function_list.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/concrete_function.h" namespace tensorflow { +namespace experimental { namespace cc { // ConcreteFunctionList helps convert an opaque pointer to an array of @@ -56,6 +57,7 @@ inline std::vector ConcreteFunctionList::ToVector() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/function_metadata.h b/tensorflow/cc/saved_model/experimental/public/function_metadata.h index c3dcc45af0e..11e1a860d84 100644 --- a/tensorflow/cc/saved_model/experimental/public/function_metadata.h +++ b/tensorflow/cc/saved_model/experimental/public/function_metadata.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" namespace tensorflow { +namespace experimental { namespace cc { // FunctionMetadata stores additional function information, including @@ -40,6 +41,7 @@ class FunctionMetadata final { }; } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_FUNCTION_METADATA_H_ diff --git a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h index 814479de213..04018bf2aab 100644 --- a/tensorflow/cc/saved_model/experimental/public/saved_model_api.h +++ b/tensorflow/cc/saved_model/experimental/public/saved_model_api.h @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/experimental/public/concrete_function_list.h" namespace tensorflow { +namespace experimental { namespace cc { // SavedModelAPI offers a way to load Tensorflow Saved Models @@ -155,6 +156,7 @@ inline std::vector SavedModelAPI::ListFunctions() { } } // namespace cc +} // namespace experimental } // namespace tensorflow #endif // TENSORFLOW_CC_SAVED_MODEL_EXPERIMENTAL_PUBLIC_SAVED_MODEL_API_H_ diff --git a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc index 155c58604bf..7f7f6b09a6d 100644 --- a/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc +++ b/tensorflow/cc/saved_model/experimental/tests/saved_model_api_test.cc @@ -26,10 +26,14 @@ limitations under the License. 
#include "tensorflow/core/platform/stringpiece.h" #include "tensorflow/core/platform/test.h" -namespace tensorflow { namespace { +using tensorflow::experimental::cc::Runtime; +using tensorflow::experimental::cc::RuntimeBuilder; +using tensorflow::experimental::cc::SavedModelAPI; +using tensorflow::experimental::cc::Status; + constexpr char kTestData[] = "cc/saved_model/testdata"; std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { @@ -43,21 +47,21 @@ std::string SavedModelPath(tensorflow::StringPiece saved_model_dir) { class CPPSavedModelAPITest : public ::testing::TestWithParam {}; TEST_P(CPPSavedModelAPITest, LoadsSavedModelWithTags) { - cc::Status status; - cc::RuntimeBuilder builder; + Status status; + RuntimeBuilder builder; bool use_tfrt = GetParam(); if (use_tfrt) { GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. } builder.SetUseTFRT(use_tfrt); - std::unique_ptr runtime = builder.Build(&status); + std::unique_ptr runtime = builder.Build(&status); ASSERT_TRUE(status.ok()) << status.message(); std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); std::unordered_set tags = {"serve"}; - std::unique_ptr model = - cc::SavedModelAPI::Load(model_dir, *runtime, &status, &tags); + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status, &tags); // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. // That unblocks writing other tests that require a TF_SavedModel*, @@ -67,20 +71,20 @@ TEST_P(CPPSavedModelAPITest, LoadsSavedModelWithTags) { } TEST_P(CPPSavedModelAPITest, LoadsSavedModel) { - cc::Status status; - cc::RuntimeBuilder builder; + Status status; + RuntimeBuilder builder; bool use_tfrt = GetParam(); if (use_tfrt) { GTEST_SKIP(); // TODO(chky) : Enable this once TFRT is open sourced. } builder.SetUseTFRT(use_tfrt); - std::unique_ptr runtime = builder.Build(&status); + std::unique_ptr runtime = builder.Build(&status); ASSERT_TRUE(status.ok()) << status.message(); std::string model_dir = SavedModelPath("VarsAndArithmeticObjectGraph"); - std::unique_ptr model = - cc::SavedModelAPI::Load(model_dir, *runtime, &status); + std::unique_ptr model = + SavedModelAPI::Load(model_dir, *runtime, &status); // TODO(bmzhao): Change this to expect TF_OK when loading is implemented. // That unblocks writing other tests that require a TF_SavedModel*, @@ -94,4 +98,3 @@ INSTANTIATE_TEST_SUITE_P(RuntimeAgnosticCPPSavedModelTests, } // namespace -} // namespace tensorflow From 9eac27f8bb3404567d6db6698c3163e12f09d960 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 12:55:58 -0700 Subject: [PATCH 050/412] Add Int8/BF16 1D un-tiled layout support for TPU. Host can transfer data to/from device directly in host layout and pack/unpack will be done on device side. 
PiperOrigin-RevId: 311184816 Change-Id: Ib08ef8ec0c3189455b3459af223a2960ca46a0ac --- .../compiler/xla/service/hlo_verifier.cc | 6 ++++-- .../compiler/xla/service/layout_assignment.cc | 10 ++------- .../xla/service/layout_assignment_test.cc | 21 ------------------- 3 files changed, 6 insertions(+), 31 deletions(-) diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 360c8e50d55..d15a36532eb 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -662,9 +662,11 @@ Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) { shape_size_function_(bitcast->operand(0)->shape())) { return InternalError( "Bitcast cannot have different shape sizes of output (%d) and operand " - "(%d)", + "(%d) (%s) (%s)", shape_size_function_(bitcast->shape()), - shape_size_function_(bitcast->operand(0)->shape())); + shape_size_function_(bitcast->operand(0)->shape()), + bitcast->shape().ToString(true), + bitcast->operand(0)->shape().ToString(true)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 84654bf3213..13699f3adf9 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -951,7 +951,8 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { if (!Shape::Equal() .IgnoreDynamicDimension() .MinorToMajorOnlyInLayout()(instruction_subshape, - buffer->shape())) { + buffer->shape()) && + instruction->opcode() != HloOpcode::kBitcast) { return InternalError( "Layout of instruction %s at index {%s} does not match " "source LogicalBuffer %s: %s vs %s", @@ -1798,13 +1799,6 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { // potential bugs in the layout assignment pass that may accidentally use the // existing layout. for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kBitcast) { - // bitcasts are inherently layout sensitive and so a bitcast instruction - // present in the IR before layout assignment is a bug. - return InternalError( - "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString()); - } // Some instructions carry mandatory layouts in their shape. 
if (instruction->opcode() != HloOpcode::kInfeed && !IsLayoutConstrainedCustomCall(instruction) && diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 304a80c7a52..6e575247e6b 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -814,27 +814,6 @@ TEST_F(LayoutAssignmentTest, ConditionalAsymmetricLayout) { EXPECT_THAT(false_result->opcode(), HloOpcode::kCopy); } -TEST_F(LayoutAssignmentTest, InternalErrorOnBitcast) { - auto builder = HloComputation::Builder(TestName()); - auto constant0 = builder.AddInstruction( - HloInstruction::CreateConstant(LiteralUtil::CreateR2WithLayout( - {{1.0, 2.0}, {3.0, 4.0}}, LayoutUtil::MakeLayout({0, 1})))); - builder.AddInstruction( - HloInstruction::CreateBitcast(constant0->shape(), constant0)); - auto m = CreateNewVerifiedModule(); - m->AddEntryComputation(builder.Build()); - - ComputationLayout computation_layout( - m->entry_computation()->ComputeProgramShape()); - LayoutAssignment layout_assignment(&computation_layout); - Status error_status = layout_assignment.Run(m.get()).status(); - EXPECT_FALSE(error_status.ok()); - EXPECT_THAT( - error_status.error_message(), - ::testing::HasSubstr( - "Unexpected bitcast operation seen during layout assignment")); -} - TEST_F(LayoutAssignmentTest, ChannelLayoutMismatch) { // Pin non matching layouts to parameter and root. const char* module_str = R"( From e036f1bd8f5a3f64276f9c79892998f770598337 Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Tue, 12 May 2020 12:56:18 -0700 Subject: [PATCH 051/412] Code cleanup: Use the combined left-or-right shift instruction. The Xtensa compiler probably already did this optimization, as there is absolutely no difference in the generated binary. PiperOrigin-RevId: 311184878 Change-Id: I28891223b89987bd23304701a210c2c6d49ab7f2 --- .../lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h index 2ed3e45ece1..918192c4d8f 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/fixedpoint_utils.h @@ -65,7 +65,11 @@ inline ae_q56s MultiplyByQuantizedMultiplier(ae_p24x2s x_24x2, ae_q56s result_56 = AE_MULP24S_HH(x_24x2, quantized_multiplier_24x2); // Shift right if shift amount is positive, left if shift amount is negative. - result_56 = AE_SLAASQ56S(result_56, shift_amount); + if (shift_amount >= 0) { + result_56 = AE_Q56S_SRA(result_56, shift_amount); + } else { + result_56 = AE_Q56S_SLA(result_56, -shift_amount); + } // Round off the bottom 16 bits. // Q48.0 / 2^16 -> Q32.0 aligned to 48 bits. From 117c75d3117ac8babe84393dce32dbc2dd2dbe36 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 13:43:09 -0700 Subject: [PATCH 052/412] Add layout config to HloModuleConfig. 
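As the TODO in the diff notes, this sits alongside the existing fusion and dot configs, with layout_config_[v][i] holding layout decision i for operation v as a sequence of int64 values (the patch leaves the meaning of each sequence, for instance a minor-to-major order, to the client). A minimal sketch of populating it through the new accessors follows; the helper itself is hypothetical, and only the accessor names and the nested std::vector<std::vector<std::vector<int64>>> element type come from this change.

// Hypothetical helper, not part of the patch: append one layout decision
// for the operation at index v, using only the new HloModuleConfig accessors.
#include <utility>
#include <vector>
#include "tensorflow/compiler/xla/service/hlo_module_config.h"

namespace xla {

void AppendLayoutDecision(HloModuleConfig* config, int v,
                          std::vector<int64> decision) {
  auto* layout_config = config->mutable_layout_config();
  if (layout_config->size() <= static_cast<size_t>(v)) {
    layout_config->resize(v + 1);
  }
  // layout_config()[v][i] is decision i for operation v.
  (*layout_config)[v].push_back(std::move(decision));
}

}  // namespace xla
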
PiperOrigin-RevId: 311193903 Change-Id: I9b6680c5a9919804e449ed617bd6bc310800183e --- .../compiler/xla/service/hlo_module_config.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index 61ea8392d94..833d0fe59d0 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -204,6 +204,14 @@ class HloModuleConfig { std::vector>* mutable_dot_config() { return &dot_config_; } + absl::Span>> layout_config() const { + return layout_config_; + } + + std::vector>>* mutable_layout_config() { + return &layout_config_; + } + private: // If you add new members, be sure to update compilation_cache_key. @@ -241,6 +249,9 @@ class HloModuleConfig { FusionConfigCollection fusion_config_collection_ = FusionConfigCollection::kOff; + // TODO(b/155665133): Consolidate fusion, dot, and layout config into a proto + // similar to backend config. + // Custom fusion configuration, where fusion_config_[c][v] control if node v // in computation c must be fused to all its consumers (true) or not (false). std::vector> fusion_config_; @@ -249,6 +260,10 @@ class HloModuleConfig { // how to convert dot operation v (sorted topologically and by computation) to // convolution. std::vector> dot_config_; + + // Layout configuration, where layout_config_[v][i] controls the layout + // decision i of operation v. + std::vector>> layout_config_; }; } // namespace xla From fd895bf2b98250929a442e5cf689f6bb272ac52c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 13:47:34 -0700 Subject: [PATCH 053/412] [BUILD] Create a separate BUILD file for "tensorflow/core/protobuf"`. This change leaves all existing targets in "tensorflow/core/BUILD" in place, with some becoming aliases. In future, we will remove aliases and point to the new locations. 
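For existing C++ code this is intended to be churn at the Bazel level only: the .proto sources already live under tensorflow/core/protobuf/, so the generated headers keep their include paths and only dependency labels eventually need to follow the new targets (the aliases keep old labels working for now). Below is an illustrative sketch of the consumer side that stays untouched, assuming the standard proto codegen layout; it is not part of this change.

// Not from this patch: code like this keeps compiling unchanged, because
// generated headers are derived from the .proto file path, which does not
// move. Only BUILD deps move, e.g.
//   "//tensorflow/core:error_codes_proto_impl_cc"
//     becomes "//tensorflow/core/protobuf:error_codes_proto_impl_cc".
#include "tensorflow/core/protobuf/error_codes.pb.h"

tensorflow::error::Code DefaultErrorCode() { return tensorflow::error::OK; }
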
PiperOrigin-RevId: 311194740 Change-Id: Id413277651b260641c1c2e06cb54d16629e6e662 --- tensorflow/compiler/xla/service/gpu/BUILD | 8 +- tensorflow/core/BUILD | 194 +++++++----------- tensorflow/core/lib/core/BUILD | 9 +- tensorflow/core/platform/BUILD | 2 +- .../core/platform/default/build_config.bzl | 4 +- tensorflow/core/protobuf/BUILD | 182 ++++++++++++++++ tensorflow/go/saved_model.go | 2 +- tensorflow/go/signature.go | 2 +- tensorflow/go/signature_test.go | 2 +- tensorflow/python/BUILD | 24 +-- 10 files changed, 282 insertions(+), 147 deletions(-) create mode 100644 tensorflow/core/protobuf/BUILD diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 8f8263a85f9..bff8734de5f 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -686,7 +686,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", @@ -722,7 +722,7 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_pass", - "//tensorflow/core:autotuning_proto_cc", + "//tensorflow/core/protobuf:autotuning_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", @@ -1676,7 +1676,7 @@ tf_proto_library_cc( protodeps = [ "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/core:autotuning_proto", + "//tensorflow/core/protobuf:autotuning_proto", ], ) @@ -1687,8 +1687,8 @@ cc_library( deps = [ ":gpu_autotuning_proto_cc", "//tensorflow/compiler/xla:debug_options_flags", - "//tensorflow/core:autotuning_proto_cc", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core/protobuf:autotuning_proto_cc", "@com_google_absl//absl/container:flat_hash_map", ], ) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index a655a9509d3..6b4874a8393 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -105,7 +105,6 @@ load("//tensorflow:tensorflow.bzl", "tf_monitoring_deps") # For platform specific build config load( "//tensorflow/core/platform:build_config.bzl", - "tf_additional_all_protos", "tf_additional_lib_deps", "tf_additional_test_deps", "tf_jspb_proto_library", @@ -114,11 +113,9 @@ load( "tf_portable_deps_no_runtime", "tf_portable_proto_lib", "tf_proto_library", - "tf_proto_library_cc", "tf_protos_all_impl", "tf_protos_grappler_impl", "tf_protos_profiler_impl", - "tf_pyclif_proto_library", ) load( "//tensorflow/core/platform:rules_cc.bzl", @@ -181,18 +178,18 @@ package_group(name = "friends") # filegroup; e.g. ones with individual proto_library targets. 
# LINT.IfChange COMMON_PROTO_SRCS = [ - "protobuf/bfc_memory_map.proto", - "protobuf/config.proto", - "protobuf/cluster.proto", - "protobuf/debug.proto", - "protobuf/device_filters.proto", - "protobuf/device_properties.proto", - "protobuf/graph_debug_info.proto", - "protobuf/queue_runner.proto", - "protobuf/rewriter_config.proto", - "protobuf/tensor_bundle.proto", - "protobuf/saver.proto", - "protobuf/verifier_config.proto", + "//tensorflow/core/protobuf:bfc_memory_map.proto", + "//tensorflow/core/protobuf:config.proto", + "//tensorflow/core/protobuf:cluster.proto", + "//tensorflow/core/protobuf:debug.proto", + "//tensorflow/core/protobuf:device_filters.proto", + "//tensorflow/core/protobuf:device_properties.proto", + "//tensorflow/core/protobuf:graph_debug_info.proto", + "//tensorflow/core/protobuf:queue_runner.proto", + "//tensorflow/core/protobuf:rewriter_config.proto", + "//tensorflow/core/protobuf:tensor_bundle.proto", + "//tensorflow/core/protobuf:saver.proto", + "//tensorflow/core/protobuf:verifier_config.proto", ] EXAMPLE_PROTO_SRCS = [ @@ -239,7 +236,7 @@ PROFILER_PROTO_SRCS = [ ] ERROR_CODES_PROTO_SRCS = [ - "protobuf/error_codes.proto", + "//tensorflow/core/protobuf:error_codes.proto", "//tensorflow/core/lib/core:error_codes.proto", ] # LINT.ThenChange(//tensorflow/core/portable_proto_config.asciipb) @@ -252,11 +249,13 @@ tf_proto_library( cc_api_version = 2, make_default_target_header_only = True, protodeps = [ - ":core_protos", - ":error_codes_proto_impl", "//tensorflow/core/example:protos_all", "//tensorflow/core/framework:protos_all", "//tensorflow/core/lib/core:error_codes_proto", + "//tensorflow/core/profiler/protobuf:xplane_proto", + "//tensorflow/core/profiler:profiler_options_proto", + "//tensorflow/core/protobuf:error_codes_proto_impl", + "//tensorflow/core/protobuf:for_core_protos", "//tensorflow/core/util:protos_all", "//tensorflow/core/util:test_log_proto_impl", ], @@ -1603,20 +1602,13 @@ alias( [ alias( name = "protobuf_%s_pyclif%s" % (proto_name, target_suffix), - actual = ":protobuf/%s_pyclif%s" % (proto_name, target_suffix), + actual = "//tensorflow/core/protobuf:%s_pyclif%s" % (proto_name, target_suffix), visibility = ["//visibility:public"], ) for target_suffix in [ "", "_pb2", ] - ] + [ - tf_pyclif_proto_library( - name = "protobuf/%s_pyclif" % proto_name, - proto_lib = ":protos_all", - proto_srcfile = "protobuf/%s.proto" % proto_name, - visibility = ["//visibility:public"], - ), ] for proto_name in [ "config", @@ -1630,77 +1622,74 @@ alias( # ----------------------------------------------------------------------------- # Internal targets -tf_proto_library( +alias( name = "autotuning_proto", - srcs = ["protobuf/autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library( +alias( + name = "autotuning_proto_cc", + actual = "//tensorflow/core/protobuf:autotuning_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( name = "conv_autotuning_proto", - srcs = ["protobuf/conv_autotuning.proto"], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - "//tensorflow/stream_executor:dnn_proto", - ], + actual = "//tensorflow/core/protobuf:conv_autotuning_proto", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "worker_proto", - srcs = ["protobuf/worker.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = 
["//visibility:public"], -) - -tf_proto_library_cc( - name = "worker_service_proto", - srcs = ["protobuf/worker_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":worker_proto"], +alias( + name = "conv_autotuning_proto_cc", + actual = "//tensorflow/core/protobuf:conv_autotuning_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "master_proto", - srcs = ["protobuf/master.proto"], - cc_api_version = 2, - protodeps = tf_additional_all_protos(), - visibility = ["//tensorflow:internal"], -) - -tf_proto_library_cc( - name = "master_service_proto", - srcs = ["protobuf/master_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_stubby_versions = ["2"], - protodeps = [":master_proto"], +alias( + name = "worker_proto_cc", + actual = "//tensorflow/core/protobuf:worker_proto_cc", visibility = [ "//tensorflow:internal", ], ) -tf_proto_library_cc( - name = "eager_service_proto", - srcs = ["protobuf/eager_service.proto"], - has_services = 1, - cc_api_version = 2, - cc_grpc_version = 1, - cc_stubby_versions = ["2"], - protodeps = tf_additional_all_protos(), +alias( + name = "worker_service_proto_cc", + actual = "//tensorflow/core/protobuf:worker_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "master_proto_cc", + actual = "//tensorflow/core/protobuf:master_proto_cc", + visibility = [ + "//learning/brain/frameworks/uptc:__subpackages__", + "//tensorflow:internal", + ], +) + +alias( + name = "master_service_proto_cc", + actual = "//tensorflow/core/protobuf:master_service_proto_cc", + visibility = [ + "//tensorflow:internal", + ], +) + +alias( + name = "eager_service_proto_cc", + actual = "//tensorflow/core/protobuf:eager_service_proto_cc", visibility = [ "//tensorflow:internal", ], @@ -2112,49 +2101,14 @@ cc_library( ], ) -tf_proto_library( +alias( name = "error_codes_proto_impl", - srcs = ["protobuf/error_codes.proto"], - cc_api_version = 2, - make_default_target_header_only = True, + actual = "//tensorflow/core/protobuf:error_codes_proto_impl", ) -tf_proto_library( - name = "core_protos", - srcs = COMMON_PROTO_SRCS + [ - # Protos which are not needed on mobile builds, but should be included - # in protos_all. - # - # Note that some protos are in neither core_proto_srcs nor this - # filegroup; e.g. ones with individual proto_library targets. - "protobuf/control_flow.proto", - # TODO(ebrevdo): Re-enable once CriticalSection is in core. 
- # "protobuf/critical_section.proto", - "protobuf/data/experimental/snapshot.proto", - "protobuf/debug_event.proto", - "protobuf/meta_graph.proto", - "protobuf/named_tensor.proto", - "protobuf/remote_tensor_handle.proto", - "protobuf/saved_model.proto", - "protobuf/saved_object_graph.proto", - "protobuf/struct.proto", - "protobuf/tensorflow_server.proto", - "protobuf/trackable_object_graph.proto", - "protobuf/transport_options.proto", - ], - cc_api_version = 2, - make_default_target_header_only = True, - protodeps = [ - ":error_codes_proto_impl", - "//tensorflow/core/example:protos_all", - "//tensorflow/core/framework:protos_all", - "//tensorflow/core/lib/core:error_codes_proto", - "//tensorflow/core/profiler/protobuf:xplane_proto", - "//tensorflow/core/profiler:profiler_options_proto", - "//tensorflow/core/util:protos_all", - "//tensorflow/core/util:test_log_proto_impl", - ], - visibility = ["//visibility:private"], +alias( + name = "error_codes_proto_impl_cc", + actual = "//tensorflow/core/protobuf:error_codes_proto_impl_cc", ) alias( @@ -2446,13 +2400,9 @@ alias( visibility = ["//visibility:public"], ) -tf_proto_library_cc( - name = "replay_log_proto", - srcs = ["protobuf/replay_log.proto"], - cc_api_version = 2, - protodeps = [ - ":master_proto", - ] + tf_additional_all_protos(), +alias( + name = "replay_log_proto_cc", + actual = "//tensorflow/core/protobuf:replay_log_proto_cc", visibility = [ "//tensorflow:internal", ], diff --git a/tensorflow/core/lib/core/BUILD b/tensorflow/core/lib/core/BUILD index 80ad4943f16..491e4c5e7aa 100644 --- a/tensorflow/core/lib/core/BUILD +++ b/tensorflow/core/lib/core/BUILD @@ -138,10 +138,13 @@ tf_proto_library( cc_api_version = 2, make_default_target_header_only = True, protodeps = [ - "//tensorflow/core:error_codes_proto_impl", + "//tensorflow/core/protobuf:error_codes_proto_impl", ], - visibility = ["//tensorflow/core:__subpackages__"], - exports = ["//tensorflow/core:error_codes_proto_impl"], + visibility = [ + "//tensorflow/core:__subpackages__", + "//tensorflow/core/protobuf:__subpackages__", + ], + exports = ["//tensorflow/core/protobuf:error_codes_proto_impl"], ) # Export source files needed for mobile builds, which do not use granular targets. 
diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 819f8fcdadb..c7ff378d2ac 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -621,7 +621,7 @@ cc_library( ":stringpiece", ":stringprintf", ":types", - "//tensorflow/core:error_codes_proto_impl_cc", + "//tensorflow/core/protobuf:error_codes_proto_impl_cc", "@com_google_absl//absl/base", ], ) diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index fd6e78addce..2dc4fdc0fd9 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -577,8 +577,8 @@ def tf_additional_all_protos(): def tf_protos_all_impl(): return [ - clean_dep("//tensorflow/core:autotuning_proto_cc_impl"), - clean_dep("//tensorflow/core:conv_autotuning_proto_cc_impl"), + clean_dep("//tensorflow/core/protobuf:autotuning_proto_cc_impl"), + clean_dep("//tensorflow/core/protobuf:conv_autotuning_proto_cc_impl"), clean_dep("//tensorflow/core:protos_all_cc_impl"), ] diff --git a/tensorflow/core/protobuf/BUILD b/tensorflow/core/protobuf/BUILD new file mode 100644 index 00000000000..a374c808a14 --- /dev/null +++ b/tensorflow/core/protobuf/BUILD @@ -0,0 +1,182 @@ +# For platform specific build config +load( + "//tensorflow/core/platform:build_config.bzl", + "tf_additional_all_protos", + "tf_proto_library", + "tf_proto_library_cc", + "tf_pyclif_proto_library", +) + +package( + default_visibility = [ + "//tensorflow:internal", + "//tensorflow/core:__subpackages__", + "//tensorflow_models:__subpackages__", + ], + licenses = ["notice"], # Apache 2.0 +) + +COMMON_PROTO_SRCS = [ + "bfc_memory_map.proto", + "config.proto", + "cluster.proto", + "debug.proto", + "device_filters.proto", + "device_properties.proto", + "graph_debug_info.proto", + "queue_runner.proto", + "rewriter_config.proto", + "tensor_bundle.proto", + "saver.proto", + "verifier_config.proto", +] + +[ + [ + tf_pyclif_proto_library( + name = "%s_pyclif" % proto_name, + proto_lib = ":for_core_protos", + proto_srcfile = "%s.proto" % proto_name, + visibility = ["//visibility:public"], + ), + ] + for proto_name in [ + "config", + "device_properties", + "graph_debug_info", + "meta_graph", + "saved_model", + ] +] + +tf_proto_library( + name = "autotuning_proto", + srcs = ["autotuning.proto"], + cc_api_version = 2, + make_default_target_header_only = True, +) + +tf_proto_library( + name = "conv_autotuning_proto", + srcs = ["conv_autotuning.proto"], + cc_api_version = 2, + make_default_target_header_only = True, + protodeps = [ + "//tensorflow/stream_executor:dnn_proto", + ], +) + +tf_proto_library_cc( + name = "worker_proto", + srcs = ["worker.proto"], + cc_api_version = 2, + protodeps = tf_additional_all_protos(), + visibility = ["//visibility:public"], +) + +tf_proto_library_cc( + name = "worker_service_proto", + srcs = ["worker_service.proto"], + has_services = 1, + cc_api_version = 2, + cc_stubby_versions = ["2"], + protodeps = [":worker_proto"], +) + +tf_proto_library_cc( + name = "master_proto", + srcs = ["master.proto"], + cc_api_version = 2, + protodeps = tf_additional_all_protos(), + visibility = ["//tensorflow:internal"], +) + +tf_proto_library_cc( + name = "master_service_proto", + srcs = ["master_service.proto"], + has_services = 1, + cc_api_version = 2, + cc_stubby_versions = ["2"], + protodeps = [":master_proto"], +) + +tf_proto_library_cc( + name = "eager_service_proto", + srcs = ["eager_service.proto"], + has_services = 1, + 
cc_api_version = 2, + cc_grpc_version = 1, + cc_stubby_versions = ["2"], + protodeps = tf_additional_all_protos(), +) + +tf_proto_library_cc( + name = "replay_log_proto", + srcs = ["replay_log.proto"], + cc_api_version = 2, + protodeps = [ + ":master_proto", + ] + tf_additional_all_protos(), +) + +tf_proto_library( + name = "error_codes_proto_impl", + srcs = ["error_codes.proto"], + cc_api_version = 2, + make_default_target_header_only = True, +) + +exports_files( + srcs = ["error_codes.proto"] + COMMON_PROTO_SRCS + [ + # Protos which are not needed on mobile builds, but should be included + # in protos_all. + # + # Note that some protos are in neither core_proto_srcs nor this + # filegroup; e.g. ones with individual proto_library targets. + "control_flow.proto", + # TODO(ebrevdo): Re-enable once CriticalSection is in core. + # "critical_section.proto", + "data/experimental/snapshot.proto", + "debug_event.proto", + "meta_graph.proto", + "named_tensor.proto", + "remote_tensor_handle.proto", + "saved_model.proto", + "saved_object_graph.proto", + "struct.proto", + "tensorflow_server.proto", + "trackable_object_graph.proto", + "transport_options.proto", + ], +) + +tf_proto_library( + name = "for_core_protos", + srcs = COMMON_PROTO_SRCS + [ + # Protos which are not needed on mobile builds, but should be included + # in protos_all. + # + # Note that some protos are in neither core_proto_srcs nor this + # filegroup; e.g. ones with individual proto_library targets. + "control_flow.proto", + # TODO(ebrevdo): Re-enable once CriticalSection is in core. + # "critical_section.proto", + "data/experimental/snapshot.proto", + "debug_event.proto", + "meta_graph.proto", + "named_tensor.proto", + "remote_tensor_handle.proto", + "saved_model.proto", + "saved_object_graph.proto", + "struct.proto", + "tensorflow_server.proto", + "trackable_object_graph.proto", + "transport_options.proto", + ], + cc_api_version = 2, + make_default_target_header_only = True, + protodeps = [ + ":error_codes_proto_impl", + "//tensorflow/core/framework:protos_all", + ], +) diff --git a/tensorflow/go/saved_model.go b/tensorflow/go/saved_model.go index 7aa1e83cbc4..64ae82e3b01 100644 --- a/tensorflow/go/saved_model.go +++ b/tensorflow/go/saved_model.go @@ -22,7 +22,7 @@ import ( "unsafe" "github.com/golang/protobuf/proto" - corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/core_protos_go_proto" + corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto" ) // #include diff --git a/tensorflow/go/signature.go b/tensorflow/go/signature.go index 8aac0e2ec93..c2db0c75247 100644 --- a/tensorflow/go/signature.go +++ b/tensorflow/go/signature.go @@ -16,7 +16,7 @@ limitations under the License. 
package tensorflow -import corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/core_protos_go_proto" +import corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto" // #include "tensorflow/c/c_api.h" import "C" diff --git a/tensorflow/go/signature_test.go b/tensorflow/go/signature_test.go index e6927f3cebd..f9fa8427819 100644 --- a/tensorflow/go/signature_test.go +++ b/tensorflow/go/signature_test.go @@ -20,9 +20,9 @@ import ( "fmt" "testing" - corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/core_protos_go_proto" tspb "github.com/tensorflow/tensorflow/tensorflow/go/core/framework/tensor_shape_go_proto" typb "github.com/tensorflow/tensorflow/tensorflow/go/core/framework/types_go_proto" + corepb "github.com/tensorflow/tensorflow/tensorflow/go/core/protobuf/for_core_protos_go_proto" ) func TestSignatureFromProto(t *testing.T) { diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 0b046ea8d61..11da45fbcbb 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -655,15 +655,15 @@ tf_python_pybind_extension( "@com_google_absl//absl/types:optional", ] + if_static( extra_deps = [ - "//tensorflow/core:eager_service_proto_cc", - "//tensorflow/core:master_proto_cc", - "//tensorflow/core:worker_proto_cc", + "//tensorflow/core/protobuf:eager_service_proto_cc", + "//tensorflow/core/protobuf:master_proto_cc", + "//tensorflow/core/protobuf:worker_proto_cc", "//tensorflow/core:version_lib", ], otherwise = [ - "//tensorflow/core:eager_service_proto_cc_headers_only", - "//tensorflow/core:master_proto_cc_headers_only", - "//tensorflow/core:worker_proto_cc_headers_only", + "//tensorflow/core/protobuf:eager_service_proto_cc_headers_only", + "//tensorflow/core/protobuf:master_proto_cc_headers_only", + "//tensorflow/core/protobuf:worker_proto_cc_headers_only", ], ), ) @@ -8049,14 +8049,14 @@ tf_python_pybind_extension( "//tensorflow/core/platform", ] + if_static( extra_deps = [ - "//tensorflow/core:eager_service_proto_cc", - "//tensorflow/core:master_proto_cc", - "//tensorflow/core:worker_proto_cc", + "//tensorflow/core/protobuf:eager_service_proto_cc", + "//tensorflow/core/protobuf:master_proto_cc", + "//tensorflow/core/protobuf:worker_proto_cc", ], otherwise = [ - "//tensorflow/core:eager_service_proto_cc_headers_only", - "//tensorflow/core:master_proto_cc_headers_only", - "//tensorflow/core:worker_proto_cc_headers_only", + "//tensorflow/core/protobuf:eager_service_proto_cc_headers_only", + "//tensorflow/core/protobuf:master_proto_cc_headers_only", + "//tensorflow/core/protobuf:worker_proto_cc_headers_only", ], ), ) From 1186e3f2098793952aa82bf356dfe51b967fb26c Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Tue, 12 May 2020 13:49:58 -0700 Subject: [PATCH 054/412] The callback slowness warning has started firing in many situations where only built-in callbacks are called (possibly only due to logging). For the time being, its threshold must be increased. 
PiperOrigin-RevId: 311195250 Change-Id: Idff476f4650970b372bc11a25b043825b17742d5 --- tensorflow/python/keras/callbacks.py | 16 +++++++++++----- tensorflow/python/keras/callbacks_test.py | 4 ++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 6748a572805..db326ea32f0 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -307,14 +307,20 @@ class CallbackList(object): end_hook_name = hook_name begin_hook_name = 'on_{mode}_batch_begin'.format(mode=mode) - threshold_time = 0.5 * batch_time + threshold_time = 1.5 * batch_time warning_msg = ('Callbacks method `{hook}` is slow compared to ' - 'the batch time. Check your callbacks.') + 'the batch time (batch time: {batch_time:.4f}s vs ' + '`{hook}` time: {cbk_time:.4f}s). Check your callbacks.') if self._timing[begin_hook_name] > threshold_time: - logging.warning(warning_msg.format(hook=begin_hook_name)) + logging.warning(warning_msg.format( + hook=begin_hook_name, + batch_time=batch_time, + cbk_time=self._timing[begin_hook_name])) if self._timing[end_hook_name] > threshold_time: - logging.warning(warning_msg.format(hook=end_hook_name)) - + logging.warning(warning_msg.format( + hook=end_hook_name, + batch_time=batch_time, + cbk_time=self._timing[end_hook_name])) self._check_timing = False self._batch_start_time = None diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 9d15f87ed79..2f1256ee3ee 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -302,8 +302,8 @@ class KerasCallbacksTest(keras_parameterized.TestCase): epochs=10, callbacks=[SleepCallback()]) warning_msg = ('Callbacks method `on_train_batch_end` is slow compared ' - 'to the batch time. Check your callbacks.') - self.assertIn(warning_msg, warning_messages) + 'to the batch time') + self.assertIn(warning_msg, '\n'.join(warning_messages)) @keras_parameterized.run_with_all_model_types(exclude_models='functional') @keras_parameterized.run_all_keras_modes From 1638fe218d6003345460e33b7a38a8a322887d79 Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Tue, 12 May 2020 14:04:36 -0700 Subject: [PATCH 055/412] Fix for adhering to latest clang style guide. PiperOrigin-RevId: 311197936 Change-Id: I014b041ff03f656587651da9a4977688d501d330 --- tensorflow/core/framework/shape_inference_testutil.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/framework/shape_inference_testutil.h b/tensorflow/core/framework/shape_inference_testutil.h index 40a6d53d223..361f7ed13c1 100644 --- a/tensorflow/core/framework/shape_inference_testutil.h +++ b/tensorflow/core/framework/shape_inference_testutil.h @@ -16,6 +16,7 @@ limitations under the License. 
#define TENSORFLOW_CORE_FRAMEWORK_SHAPE_INFERENCE_TESTUTIL_H_ #include + #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/lib/core/status.h" @@ -90,7 +91,7 @@ class ShapeInferenceTestutil { ::tensorflow::shape_inference::ShapeInferenceTestutil::InferShapes( \ op, i, "e") \ .error_message(); \ - const std::string& substring = error_substring; \ + const std::string substring = error_substring; \ EXPECT_NE("", error_message); \ EXPECT_TRUE(absl::StrContains(error_message, substring)) \ << "Expected to see '" << substring << "' in '" << error_message \ From f581c55e4d01e4ebdeaebf6c095aff547745d893 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Tue, 12 May 2020 14:16:20 -0700 Subject: [PATCH 056/412] Introduce persistent, read-only TFLite tensor type Several operators (rank, shape) are critical for preserving the ability to resize graphs correctly at runtime. However, introduction of such ops in the graph currently makes it impossible to fully propagate shapes when tensors are allocated. This also prevents delegation of the graph for most delegates, as it introduces dynamic shapes. Introduce a new, persistent tensor type that can be treated as "constant" at the time of TfLiteRegistration::Prepare. This tensor type is allocated immediately when requested, similar to a dynamic tensor, but promises that its contents will be populated after the "producing" node is prepared, and that it won't change across subsequent evals. Update Rank/Shape operators to use this tensor allocation type. A follow-up CL will introduce a new pseudo-constant tensor check that can be used by various kernels to avoid making them dynamic. PiperOrigin-RevId: 311199934 Change-Id: I050704be7d1ff264fc1a852efade53d4021cb034 --- tensorflow/lite/c/common.c | 6 ++- tensorflow/lite/c/common.h | 14 +++++-- tensorflow/lite/core/subgraph.cc | 9 +++-- tensorflow/lite/kernels/kernel_util.h | 12 ++++++ tensorflow/lite/kernels/rank.cc | 18 ++++++--- tensorflow/lite/kernels/rank_test.cc | 13 +++++-- tensorflow/lite/kernels/shape.cc | 17 +++++--- tensorflow/lite/kernels/shape_test.cc | 13 +++++-- .../lite/micro/micro_optional_debug_tools.cc | 2 + tensorflow/lite/optional_debug_tools.cc | 2 + tensorflow/lite/python/lite_test.py | 39 +++++++++++++++++-- .../benchmark/experimental/c/c_api_types.h | 14 +++++-- 12 files changed, 129 insertions(+), 30 deletions(-) diff --git a/tensorflow/lite/c/common.c b/tensorflow/lite/c/common.c index f70a60002dd..e6b47896528 100644 --- a/tensorflow/lite/c/common.c +++ b/tensorflow/lite/c/common.c @@ -79,7 +79,8 @@ TfLiteFloatArray* TfLiteFloatArrayCreate(int size) { void TfLiteFloatArrayFree(TfLiteFloatArray* a) { free(a); } void TfLiteTensorDataFree(TfLiteTensor* t) { - if (t->allocation_type == kTfLiteDynamic) { + if (t->allocation_type == kTfLiteDynamic || + t->allocation_type == kTfLitePersistentRo) { free(t->data.raw); } t->data.raw = NULL; @@ -172,7 +173,8 @@ void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, } void TfLiteTensorRealloc(size_t num_bytes, TfLiteTensor* tensor) { - if (tensor->allocation_type != kTfLiteDynamic) { + if (tensor->allocation_type != kTfLiteDynamic && + tensor->allocation_type != kTfLitePersistentRo) { return; } // TODO(b/145340303): Tensor data should be aligned. 
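Taken together with the kernel_util and Rank/Shape changes that follow, the usage pattern is: a kernel whose output value is fully determined by input shapes marks that output kTfLitePersistentRo in Prepare(), resizes it immediately, and writes the value there, so downstream kernels can read it like a constant during their own Prepare() and Eval() becomes a no-op. A condensed sketch of such a kernel is shown below; it is hypothetical, mirrors what the Rank kernel later in this patch does, and relies only on helper names from lite/kernels.

// Sketch of a hypothetical rank-like kernel using the new allocation type.
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/kernel_util.h"

namespace tflite {

TfLiteStatus PrepareRankLike(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input = GetInput(context, node, 0);
  TfLiteTensor* output = GetOutput(context, node, 0);
  output->type = kTfLiteInt32;

  // Allocate now and promise the contents will not change across Eval calls,
  // so downstream ops may treat this output as constant in their Prepare().
  SetTensorToPersistentRo(output);
  TF_LITE_ENSURE_STATUS(
      context->ResizeTensor(context, output, TfLiteIntArrayCreate(0)));

  // The input's rank is already known here, so the value is written
  // immediately; Eval() for this op then has nothing left to do.
  *GetTensorData<int32_t>(output) = NumDimensions(input);
  return kTfLiteOk;
}

}  // namespace tflite
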
diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index 9657c7e564c..ab150e87d93 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -321,15 +321,23 @@ typedef union TfLitePtrUnion { void* data; } TfLitePtrUnion; -// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped -// data (or data externally allocated). kTfLiteArenaRw is arena allocated -// data. kTfLiteDynamic is for tensors that are allocated during evaluation. +// Memory allocation strategies. +// * kTfLiteMmapRo: Read-only memory-mapped data, or data externally allocated. +// * kTfLiteArenaRw: Arena allocated with no guarantees about persistence, +// and available during eval. +// * kTfLiteArenaRwPersistent: Arena allocated but persistent across eval, and +// only available during eval. +// * kTfLiteDynamic: Allocated during eval, or for string tensors. +// * kTfLitePersistentRo: Allocated and populated during prepare. This is +// useful for tensors that can be computed during prepare and treated +// as constant inputs for downstream ops (also in prepare). typedef enum TfLiteAllocationType { kTfLiteMemNone = 0, kTfLiteMmapRo, kTfLiteArenaRw, kTfLiteArenaRwPersistent, kTfLiteDynamic, + kTfLitePersistentRo, } TfLiteAllocationType; // The delegates should use zero or positive integers to represent handles. diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 4cebd059a80..7f4e0e286ea 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -1183,7 +1183,8 @@ TfLiteStatus Subgraph::ResizeTensorImpl(TfLiteTensor* tensor, // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too. if (tensor->allocation_type == kTfLiteArenaRw || tensor->allocation_type == kTfLiteDynamic || - tensor->allocation_type == kTfLiteArenaRwPersistent) { + tensor->allocation_type == kTfLiteArenaRwPersistent || + tensor->allocation_type == kTfLitePersistentRo) { tensor_resized_since_op_invoke_ |= TfLiteIntArrayEqual(tensor->dims, new_size) == 0; if (tensor->type != kTfLiteString) { @@ -1195,14 +1196,16 @@ TfLiteStatus Subgraph::ResizeTensorImpl(TfLiteTensor* tensor, return kTfLiteError; } - // Realloc space for kTfLiteDynamic tensors. + // Realloc space for heap-allocated tensors. TfLiteTensorRealloc(bytesRequired, tensor); tensor->bytes = bytesRequired; } if (tensor->dims) TfLiteIntArrayFree(tensor->dims); tensor->dims = new_size; - if (tensor->allocation_type != kTfLiteDynamic) { + // Reset arena-allocated tensors; they will be allocated later. + if (tensor->allocation_type == kTfLiteArenaRw || + tensor->allocation_type == kTfLiteArenaRwPersistent) { tensor->data.raw = nullptr; } } else { diff --git a/tensorflow/lite/kernels/kernel_util.h b/tensorflow/lite/kernels/kernel_util.h index ad068ddd3fd..5793b08616d 100644 --- a/tensorflow/lite/kernels/kernel_util.h +++ b/tensorflow/lite/kernels/kernel_util.h @@ -87,6 +87,10 @@ inline const TfLiteTensor* GetOptionalInputTensor(TfLiteContext* context, } // Determines whether tensor is constant. +// TODO(b/138199592): Introduce new query which checks for constant OR +// persistent-read-only, which would be useful for most tensor kernels that +// are potentially dynamic based on the input tensor value availability at the +// time of prepare. inline bool IsConstantTensor(const TfLiteTensor* tensor) { return tensor->allocation_type == kTfLiteMmapRo; } @@ -105,6 +109,14 @@ inline void SetTensorToDynamic(TfLiteTensor* tensor) { } } +// Sets tensor to persistent and read-only. 
+inline void SetTensorToPersistentRo(TfLiteTensor* tensor) { + if (tensor->allocation_type != kTfLitePersistentRo) { + tensor->allocation_type = kTfLitePersistentRo; + tensor->data.raw = nullptr; + } +} + // Determines whether it is a hybrid op - one that has float inputs and // quantized weights. inline bool IsHybridOp(const TfLiteTensor* input, const TfLiteTensor* weight) { diff --git a/tensorflow/lite/kernels/rank.cc b/tensorflow/lite/kernels/rank.cc index 8e27ebcc325..53fd92f1682 100644 --- a/tensorflow/lite/kernels/rank.cc +++ b/tensorflow/lite/kernels/rank.cc @@ -30,19 +30,23 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); + const TfLiteTensor* input = GetInput(context, node, kInputTensor); TfLiteTensor* output = GetOutput(context, node, kOutputTensor); output->type = kTfLiteInt32; + // By design, the input shape is always known at the time of Prepare, even + // if the preceding op that generates |input| is dynamic. Thus, we can + // always compute the rank immediately, without waiting for Eval. + SetTensorToPersistentRo(output); + // Rank produces a 0-D int32 Tensor representing the rank of input. TfLiteIntArray* output_size = TfLiteIntArrayCreate(0); - return context->ResizeTensor(context, output, output_size); -} + TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_size)); -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TF_LITE_ENSURE_EQ(context, NumDimensions(output), 0); + // Immediately propagate the known rank to the output tensor. This allows + // downstream ops that rely on the value to use it during prepare. if (output->type == kTfLiteInt32) { int32_t* output_data = GetTensorData(output); *output_data = NumDimensions(input); @@ -53,6 +57,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + } // namespace rank TfLiteRegistration* Register_RANK() { diff --git a/tensorflow/lite/kernels/rank_test.cc b/tensorflow/lite/kernels/rank_test.cc index f3dc97126ba..5373a0a66fe 100644 --- a/tensorflow/lite/kernels/rank_test.cc +++ b/tensorflow/lite/kernels/rank_test.cc @@ -43,6 +43,9 @@ class RankOpModel : public SingleOpModel { std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } + TfLiteAllocationType GetOutputAllocationType() const { + return interpreter_->tensor(interpreter_->outputs()[0])->allocation_type; + } private: int input_; @@ -51,6 +54,13 @@ class RankOpModel : public SingleOpModel { TEST(RankOpTest, InputTypeFloat) { RankOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32); + ASSERT_EQ(model.GetOutputAllocationType(), kTfLitePersistentRo); + + // Unlike most ops, Rank populates outputs in Prepare(). + EXPECT_THAT(model.GetOutput(), ElementsAreArray({5})); + EXPECT_TRUE(model.GetOutputShape().empty()); + + // Invoke is superfluous and shouldn't change the output. 
model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({5})); @@ -59,7 +69,6 @@ TEST(RankOpTest, InputTypeFloat) { TEST(RankOpTest, InputTypeInt) { RankOpModel model({1, 3, 1, 3, 5}, TensorType_INT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({5})); EXPECT_TRUE(model.GetOutputShape().empty()); @@ -67,7 +76,6 @@ TEST(RankOpTest, InputTypeInt) { TEST(RankOpTest, ScalarTensor) { RankOpModel model({}, TensorType_FLOAT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({0})); EXPECT_TRUE(model.GetOutputShape().empty()); @@ -75,7 +83,6 @@ TEST(RankOpTest, ScalarTensor) { TEST(RankOpTest, EmptyTensor) { RankOpModel model({1, 0}, TensorType_FLOAT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({2})); EXPECT_TRUE(model.GetOutputShape().empty()); diff --git a/tensorflow/lite/kernels/shape.cc b/tensorflow/lite/kernels/shape.cc index 88794fefac4..d979f083f70 100644 --- a/tensorflow/lite/kernels/shape.cc +++ b/tensorflow/lite/kernels/shape.cc @@ -54,19 +54,22 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { return kTfLiteError; } + // By design, the input shape is always known at the time of Prepare, even + // if the preceding op that generates |input| is dynamic. Thus, we can + // always compute the shape immediately, without waiting for Eval. + SetTensorToPersistentRo(output); + // Shape always produces a 1-dimensional output tensor, where each output // element is the length of the corresponding input tensor's dimension. TfLiteIntArray* output_size = TfLiteIntArrayCreate(1); output_size->data[0] = NumDimensions(input); - return context->ResizeTensor(context, output, output_size); -} + TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_size)); -TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { - const TfLiteTensor* input = GetInput(context, node, kInputTensor); - TfLiteTensor* output = GetOutput(context, node, kOutputTensor); TFLITE_DCHECK_EQ(NumDimensions(output), 1); TFLITE_DCHECK_EQ(SizeOfDimension(output, 0), NumDimensions(input)); + // Immediately propagate the known shape to the output tensor. This allows + // downstream ops that rely on the value to use it during prepare. switch (output->type) { case kTfLiteInt32: ExtractShape(input, GetTensorData(output)); @@ -81,6 +84,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } +TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { + return kTfLiteOk; +} + } // namespace shape TfLiteRegistration* Register_SHAPE() { diff --git a/tensorflow/lite/kernels/shape_test.cc b/tensorflow/lite/kernels/shape_test.cc index 6a7dad4d3e0..3eeb83f5000 100644 --- a/tensorflow/lite/kernels/shape_test.cc +++ b/tensorflow/lite/kernels/shape_test.cc @@ -45,6 +45,9 @@ class ShapeOpModel : public SingleOpModel { int32_t GetOutputSize() { return GetTensorSize(output_); } std::vector GetOutput() { return ExtractVector(output_); } std::vector GetOutputShape() { return GetTensorShape(output_); } + TfLiteAllocationType GetOutputAllocationType() const { + return interpreter_->tensor(interpreter_->outputs()[0])->allocation_type; + } private: int input_; @@ -54,6 +57,13 @@ class ShapeOpModel : public SingleOpModel { TEST(ShapeOpTest, OutTypeInt) { ShapeOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32, TensorType_INT32); + ASSERT_EQ(model.GetOutputAllocationType(), kTfLitePersistentRo); + + // Unlike most ops, Rank populates outputs in Prepare(). 
+ EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); + EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5})); + + // Invoke is superfluous and shouldn't change the output. model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); @@ -63,7 +73,6 @@ TEST(ShapeOpTest, OutTypeInt) { TEST(ShapeOpTest, OutTypeInt64) { ShapeOpModel model({1, 3, 1, 3, 5}, TensorType_FLOAT32, TensorType_INT64); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 3, 1, 3, 5})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({5})); @@ -71,7 +80,6 @@ TEST(ShapeOpTest, OutTypeInt64) { TEST(ShapeOpTest, ScalarTensor) { ShapeOpModel model({}, TensorType_FLOAT32, TensorType_INT32); - model.Invoke(); EXPECT_EQ(model.GetOutputSize(), 0); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({0})); @@ -79,7 +87,6 @@ TEST(ShapeOpTest, ScalarTensor) { TEST(ShapeOpTest, EmptyTensor) { ShapeOpModel model({1, 0}, TensorType_FLOAT32, TensorType_INT32); - model.Invoke(); EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 0})); EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({2})); diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.cc b/tensorflow/lite/micro/micro_optional_debug_tools.cc index 70f16c78d79..42c42aea9f8 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc @@ -95,6 +95,8 @@ const char* AllocTypeName(TfLiteAllocationType type) { return "kTfLiteArenaRw"; case kTfLiteArenaRwPersistent: return "kTfLiteArenaRwPersistent"; + case kTfLitePersistentRo: + return "kTfLitePersistentRo"; } return "(invalid)"; } diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc index c5ccdb98390..2e25b0a17f7 100644 --- a/tensorflow/lite/optional_debug_tools.cc +++ b/tensorflow/lite/optional_debug_tools.cc @@ -77,6 +77,8 @@ const char* AllocTypeName(TfLiteAllocationType type) { return "kTfLiteArenaRw"; case kTfLiteArenaRwPersistent: return "kTfLiteArenaRwPersistent"; + case kTfLitePersistentRo: + return "kTfLitePersistentRo"; } return "(invalid)"; } diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py index 9ddd09edca6..1bcb2ce0ee4 100644 --- a/tensorflow/lite/python/lite_test.py +++ b/tensorflow/lite/python/lite_test.py @@ -269,9 +269,7 @@ class FromSessionTest(TestModels, parameterized.TestCase): [out_tensor]) converter.inference_input_type = lite_constants.QUANTIZED_UINT8 converter.inference_type = lite_constants.FLOAT - converter.quantized_input_stats = { - 'Placeholder': (0., 1.) - } # mean, std_dev + converter.quantized_input_stats = {'Placeholder': (0., 1.)} # mean, std_dev tflite_model = converter.convert() self.assertTrue(tflite_model) @@ -1327,6 +1325,41 @@ class FromSessionTest(TestModels, parameterized.TestCase): tflite_model = converter.convert() self.assertTrue(tflite_model) + def testResizeWithShape(self): + with ops.Graph().as_default(): + # Construct a graph with a dynamically shapped input and an internal node + # that relies on the output of that input's shape. + in_tensor = array_ops.placeholder( + shape=[None, None], dtype=dtypes.float32) + in_tensor2 = [[1, 2], [3, 4]] + out_tensor = array_ops.reshape(in_tensor2, array_ops.shape(in_tensor)) + sess = session.Session() + + converter = lite.TFLiteConverter.from_session(sess, [in_tensor], + [out_tensor]) + converter.experimental_new_converter = True + tflite_model = converter.convert() + + # Check values from converted model. 
+ interpreter = Interpreter(model_content=tflite_model) + input_details = interpreter.get_input_details() + self.assertLen(input_details, 1) + self.assertTrue(([1, 1] == input_details[0]['shape']).all()) + self.assertTrue(([-1, -1] == input_details[0]['shape_signature']).all()) + + # Resize tensor and invoke. + interpreter.resize_tensor_input(0, [4]) + interpreter.allocate_tensors() + interpreter.invoke() + + # The output should be reshaped properly according to the resized input. + output_details = interpreter.get_output_details() + self.assertLen(output_details, 1) + self.assertEqual(np.int32, output_details[0]['dtype']) + self.assertTrue(([4] == output_details[0]['shape']).all()) + output_data = interpreter.get_tensor(output_details[0]['index']) + self.assertTrue(([1, 2, 3, 4] == output_data).all()) + def testResizingIntermediateDynamicTensor(self): # This is a regression test for the case where shape of dynamic output # tensors changes between invocations. diff --git a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h index 9657c7e564c..ab150e87d93 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h +++ b/tensorflow/lite/tools/benchmark/experimental/c/c_api_types.h @@ -321,15 +321,23 @@ typedef union TfLitePtrUnion { void* data; } TfLitePtrUnion; -// Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped -// data (or data externally allocated). kTfLiteArenaRw is arena allocated -// data. kTfLiteDynamic is for tensors that are allocated during evaluation. +// Memory allocation strategies. +// * kTfLiteMmapRo: Read-only memory-mapped data, or data externally allocated. +// * kTfLiteArenaRw: Arena allocated with no guarantees about persistence, +// and available during eval. +// * kTfLiteArenaRwPersistent: Arena allocated but persistent across eval, and +// only available during eval. +// * kTfLiteDynamic: Allocated during eval, or for string tensors. +// * kTfLitePersistentRo: Allocated and populated during prepare. This is +// useful for tensors that can be computed during prepare and treated +// as constant inputs for downstream ops (also in prepare). typedef enum TfLiteAllocationType { kTfLiteMemNone = 0, kTfLiteMmapRo, kTfLiteArenaRw, kTfLiteArenaRwPersistent, kTfLiteDynamic, + kTfLitePersistentRo, } TfLiteAllocationType; // The delegates should use zero or positive integers to represent handles. From ec2837b2a112ae3ada2c10173c12ed9b2f129b02 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 14:23:34 -0700 Subject: [PATCH 057/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311201348 Change-Id: I35ed38a57bbcf68e980b69a50190a033d2b34d4e --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. 
Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 7b73cfa737035ea1934fbfd364a4aae34cf7ceb0 Mon Sep 17 00:00:00 2001 From: Meghna Natraj Date: Tue, 12 May 2020 14:28:24 -0700 Subject: [PATCH 058/412] Update TensorFlow Lite external documentation PiperOrigin-RevId: 311202267 Change-Id: Ibbca1409b33469e8a3d407e330dafc7ae079089d --- .../lite/g3doc/convert/1x_compatibility.md | 131 ++++++------ .../performance/post_training_quantization.md | 190 ++++++++++-------- 2 files changed, 169 insertions(+), 152 deletions(-) diff --git a/tensorflow/lite/g3doc/convert/1x_compatibility.md b/tensorflow/lite/g3doc/convert/1x_compatibility.md index adb2af4d8ad..9f9f277a8d9 100644 --- a/tensorflow/lite/g3doc/convert/1x_compatibility.md +++ b/tensorflow/lite/g3doc/convert/1x_compatibility.md @@ -1,30 +1,32 @@ -# TensorFlow 1.x compatibility +# TensorFlow 1.x Compatibility -The `tf.lite.TFLiteConverter` was updated between TensorFlow 1.X and 2.0. This -document explains the differences between the 1.X and 2.0 versions of the -converter, and provides information about how to use the 1.X version if -required. +The `tf.lite.TFLiteConverter` Python API was updated between TensorFlow 1.x and +2.x. This document explains the differences between the two versions, and +provides information about how to use the 1.x version if required. -## Summary of changes in Python API between 1.X and 2.0 - -The following section summarizes the changes in the Python API from 1.X to 2.0. If any of the changes raise concerns, please file a -[GitHub issue](https://github.com/tensorflow/tensorflow/issues). +[GitHub Issue](https://github.com/tensorflow/tensorflow/issues). -### Formats supported by `TFLiteConverter` +Note: We highly recommend that you +[migrate your TensorFlow 1.x code to TensorFlow 2.x code](https://www.tensorflow.org/guide/migrate) +. -The 2.0 version of the converter supports SavedModel and Keras model files -generated in both 1.X and 2.0. However, the conversion process no longer -supports "frozen graph" `GraphDef` files generated in 1.X. +## Model formats -#### Converting frozen graphs +#### SavedModel and Keras -Users who want to convert frozen graph `GraphDef` files (`.pb` files) to -TensorFlow Lite should use `tf.compat.v1.lite.TFLiteConverter`. +The `tf.lite.TFLiteConverter` API supports SavedModel and Keras HDF5 files +generated in both TensorFlow 1.x and 2.x. -The following snippet shows a frozen graph file being converted: +#### Frozen Graph + +Note: TensorFlow 2.x no longer supports the generation of frozen graph models. 
+ +The `tf.compat.v1.lite.TFLiteConverter` API supports frozen graph models +generated in TensorFlow 1.x, as shown below: ```python +import tensorflow as tf # Path to the frozen graph file graph_def_file = 'frozen_graph.pb' # A list of the names of the model's input tensors @@ -32,70 +34,68 @@ input_arrays = ['input_name'] # A list of the names of the model's output tensors output_arrays = ['output_name'] # Load and convert the frozen graph -converter = lite.TFLiteConverter.from_frozen_graph( +converter = tf.lite.TFLiteConverter.from_frozen_graph( graph_def_file, input_arrays, output_arrays) tflite_model = converter.convert() # Write the converted model to disk open("converted_model.tflite", "wb").write(tflite_model) ``` -### Quantization-aware training +## Converter attributes -The following attributes and methods associated with -[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/quantize) -have been removed from `TFLiteConverter` in TensorFlow 2.0: +#### Renamed attributes -* `inference_type` -* `inference_input_type` -* `quantized_input_stats` -* `default_ranges_stats` -* `reorder_across_fake_quant` -* `change_concat_input_ranges` -* `post_training_quantize` - Deprecated in the 1.X API -* `get_input_arrays()` +The following 1.x attribute has been renamed in 2.x. -The rewriter function that supports quantization-aware training does not support -models generated by TensorFlow 2.0. Additionally, TensorFlow Lite’s quantization -API is being reworked and streamlined in a direction that supports -quantization-aware training through the Keras API. These attributes will be -removed in the 2.0 API until the new quantization API is launched. Users who -want to convert models generated by the rewriter function can use -`tf.compat.v1.lite.TFLiteConverter`. +* `target_ops` has been renamed to `target_spec.supported_ops` - In 2.x, in + line with future additions to the optimization framework, it has become an + attribute of `TargetSpec` and has been renamed to `supported_ops`. -### Changes to `TFLiteConverter` attributes +#### Unsupported attributes -The `target_ops` attribute has become an attribute of `TargetSpec` and renamed -to `supported_ops` in line with future additions to the optimization framework. +The following 1.x attributes have been removed in 2.x. -Additionally, the following attributes have been removed: - -* `drop_control_dependency` (default: `True`) -* _Graph visualization_ - The recommended approach for visualizing a - TensorFlow Lite graph in TensorFlow 2.0 will be to use - [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py). - Unlike GraphViz, it enables users to visualize the graph after post training - quantization has occurred. The following attributes related to graph - visualization will be removed: +* _Quantization_ - In 2.x, + [quantize aware training](https://www.tensorflow.org/model_optimization/guide/quantization/training) + is supported through the Keras API and + [post training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization) + uses fewer streamlined converter flags. 
Thus, the following attributes and + methods related to quantization have been removed: + * `inference_type` + * `quantized_input_stats` + * `post_training_quantize` + * `default_ranges_stats` + * `reorder_across_fake_quant` + * `change_concat_input_ranges` + * `get_input_arrays()` +* _Visualization_ - In 2.x, the recommended approach for visualizing a + TensorFlow Lite graph is to use + [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py) + . Unlike GraphViz, it enables users to visualize the graph after post + training quantization has occurred. Thus, the following attributes related + to graph visualization have been removed: * `output_format` * `dump_graphviz_dir` * `dump_graphviz_video` +* _Frozen graph_ - In 2.x, the frozen graph model format has been removed. + Thus, the following attribute related to frozen graphs has been removed: + * `drop_control_dependency` -### General API changes +## Unsupported APIs -The following section explains several significant API changes between -TensorFlow 1.X and 2.0. +The following section explains several significant features in 1.x that have +been removed in 2.x. -#### Conversion methods +#### Conversion APIs -The following methods that were previously deprecated in 1.X will no longer be -exported in 2.0: +The following methods were deprecated in 1.x and have been removed in 2.x: * `lite.toco_convert` * `lite.TocoConverter` -#### `lite.constants` +#### `lite.constants` API -The `lite.constants` API was removed in 2.0 in order to decrease duplication +The `lite.constants` API was removed in 2.x in order to decrease duplication between TensorFlow and TensorFlow Lite. The following list maps the `lite.constant` type to the TensorFlow type: @@ -106,12 +106,15 @@ between TensorFlow and TensorFlow Lite. The following list maps the * `lite.constants.STRING`: `tf.string` * `lite.constants.QUANTIZED_UINT8`: `tf.uint8` -Additionally, `lite.constants.TFLITE` and `lite.constants.GRAPHVIZ_DOT` were -removed due to the deprecation of the `output_format` flag in `TFLiteConverter`. +Additionally, the deprecation of the `output_format` flag in `TFLiteConverter` +led to the removal of the following constants: -#### `lite.OpHint` +* `lite.constants.TFLITE` +* `lite.constants.GRAPHVIZ_DOT` -The `OpHint` API is currently not available in 2.0 due to an incompatibility -with the 2.0 APIs. This API enables conversion of LSTM based models. Support for -LSTMs in 2.0 is being investigated. All related `lite.experimental` APIs have -been removed due to this issue. +#### `lite.OpHint` API + +The `OpHint` API is currently unsupported due to an incompatibility with the 2.x +APIs. This API enables conversion of LSTM based models. Support for LSTMs in 2.x +is being investigated. All related `lite.experimental` APIs have been removed +due to this issue. diff --git a/tensorflow/lite/g3doc/performance/post_training_quantization.md b/tensorflow/lite/g3doc/performance/post_training_quantization.md index 194d102d43d..a526be75b61 100644 --- a/tensorflow/lite/g3doc/performance/post_training_quantization.md +++ b/tensorflow/lite/g3doc/performance/post_training_quantization.md @@ -4,51 +4,44 @@ Post-training quantization is a conversion technique that can reduce model size while also improving CPU and hardware accelerator latency, with little degradation in model accuracy. You can perform these techniques using an already-trained float TensorFlow model when you convert it to TensorFlow Lite -format. 
+format using the [TensorFlow Lite Converter](../convert/). Note: The procedures on this page require TensorFlow 1.15 or higher. - -### Optimization options +### Optimization Methods There are several post-training quantization options to choose from. Here is a summary table of the choices and the benefits they provide: -| Technique | Benefits | Hardware | -| ------------------------- | ------------------------- | ------------------- | -| Dynamic range | 4x smaller, 2-3x speedup, | CPU | -: quantization : accuracy : : -| Full integer quantization | 4x smaller, 3x+ speedup | CPU, Edge TPU, etc. | -| Float16 quantization | 2x smaller, potential GPU | CPU/GPU | -: : acceleration : : +| Technique | Benefits | Hardware | +| -------------------- | ------------------------- | ---------------- | +| Dynamic range | 4x smaller, 2-3x speedup | CPU | +: quantization : : : +| Full integer | 4x smaller, 3x+ speedup | CPU, Edge TPU, | +: quantization : : Microcontrollers : +| Float16 quantization | 2x smaller, potential GPU | CPU, GPU | +: : acceleration : : This decision tree can help determine which post-training quantization method is best for your use case: ![post-training optimization options](images/optimization.jpg) -Alternatively, you might achieve higher accuracy if you perform -[quantization-aware training]( -https://github.com/tensorflow/tensorflow/tree/r1.14/tensorflow/contrib/quantize). -However, doing so requires some model modifications to add fake quantization -nodes, whereas the post-training quantization techniques on this page use an -existing pre-trained model. - ### Dynamic range quantization The simplest form of post-training quantization statically quantizes only the -weights from floating point to 8-bits of precision. This technique is enabled as -an option in the [TensorFlow Lite converter](../convert/): +weights from floating point to integer, which has 8-bits of precision: -``` +
 import tensorflow as tf
 converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
-converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
 tflite_quant_model = converter.convert()
-```
+
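For context, a minimal sketch of saving the dynamically quantized flatbuffer produced above and running it with `tf.lite.Interpreter`; the output file name and the random input here are illustrative assumptions, not part of this patch:

```python
import numpy as np
import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_quant_model = converter.convert()

# Persist the serialized flatbuffer (the file name is an arbitrary choice).
with open('model_dynamic_range.tflite', 'wb') as f:
  f.write(tflite_quant_model)

# Run inference; inputs and outputs remain float32 in this mode.
interpreter = tf.lite.Interpreter(model_content=tflite_quant_model)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()[0]
output_details = interpreter.get_output_details()[0]
sample = np.random.random_sample(input_details['shape']).astype(np.float32)
interpreter.set_tensor(input_details['index'], sample)
interpreter.invoke()
result = interpreter.get_tensor(output_details['index'])
```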
-At inference, weights are converted from 8-bits of precision to floating point and -computed using floating-point kernels. This conversion is done once and cached to reduce latency. +At inference, weights are converted from 8-bits of precision to floating point +and computed using floating-point kernels. This conversion is done once and +cached to reduce latency. To further improve latency, "dynamic-range" operators dynamically quantize activations based on their range to 8-bits and perform computations with 8-bit @@ -58,89 +51,105 @@ point, so that the speedup with dynamic-range ops is less than a full fixed-point computation. Dynamic-range ops are available for the most compute-intensive operators in a network: -* [tf.contrib.layers.fully_connected](https://www.tensorflow.org/api_docs/python/tf/contrib/layers/fully_connected) -* [tf.nn.conv2d](https://www.tensorflow.org/api_docs/python/tf/nn/conv2d) -* [tf.nn.embedding_lookup](https://www.tensorflow.org/api_docs/python/tf/nn/embedding_lookup) -* [BasicRNN](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/BasicRNNCell) -* [tf.nn.bidirectional_dynamic_rnn for BasicRNNCell type](https://www.tensorflow.org/api_docs/python/tf/nn/bidirectional_dynamic_rnn) -* [tf.nn.dynamic_rnn for LSTM and BasicRNN Cell types](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn) +* `tf.keras.layers.Dense` +* `tf.keras.layers.Conv2D` +* `tf.keras.layers.LSTM` +* `tf.nn.embedding_lookup` +* `tf.compat.v1.nn.rnn_cell.BasicRNNCell` +* `tf.compat.v1.nn.bidirectional_dynamic_rnn` +* `tf.compat.v1.nn.dynamic_rnn` - -### Full integer quantization of weights and activations +### Full integer quantization You can get further latency improvements, reductions in peak memory usage, and -access to integer only hardware accelerators by making sure all model math is -quantized. +access to integer only hardware devices or accelerators by making sure all model +math is integer quantized. To do this, you need to measure the dynamic range of activations and inputs by -supplying a representative data set. You can simply create an input data -generator and provide it to our converter. For example: +supplying sample input data to the converter. Refer to the +`representative_dataset_gen()` function used in the following code. -``` +#### Integer with float fallback (using default float input/output) + +In order to fully integer quantize a model, but use float operators when they +don't have an integer implementation (to ensure conversion occurs smoothly), use +the following steps: + +
 import tensorflow as tf
-
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
 def representative_dataset_gen():
   for _ in range(num_calibration_steps):
     # Get sample input data as a numpy array in a method of your choosing.
     yield [input]
-
-converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
-converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.representative_dataset = representative_dataset_gen
+converter.representative_dataset = representative_dataset_gen
 tflite_quant_model = converter.convert()
-```
+
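In the snippet above, `num_calibration_steps` and `input` are placeholders. One possible way to wire in real calibration data is sketched below; the `calibration_images.npy` file, the sample count, and the batching are assumptions for illustration only:

```python
import numpy as np
import tensorflow as tf

# Assumed: a float32 array of preprocessed samples shaped like the model
# input, e.g. [N, height, width, channels]; the count used here is arbitrary.
calibration_images = np.load('calibration_images.npy').astype(np.float32)

def representative_dataset_gen():
  for sample in calibration_images[:100]:
    # One list entry per model input; add the batch dimension explicitly.
    yield [np.expand_dims(sample, axis=0)]

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.representative_dataset = representative_dataset_gen
tflite_quant_model = converter.convert()
```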
-The resulting model should be fully quantized, but any -ops that do not have quantized implementations are left in -floating point. This allows conversion to occur smoothly, but the model won't be -compatible with accelerators that require full integer quantization. +Note: This won't be compatible with integer only devices (such as 8-bit +microcontrollers) and accelerators (such as the Coral Edge TPU). For convenience +during inference, the input and output still remain float in order to have the +same interface as the original float only model. -Additionally, the model still uses float input and output for convenience. +#### Integer only -To ensure compatibility with some accelerators (such as the Coral Edge TPU), you -can enforce full integer quantization for all ops and use integer input and -output by adding the following lines before you convert: +*This is a common use case for +[TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) +and [Coral Edge TPUs](https://coral.ai/).* -``` -converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] -converter.inference_input_type = tf.uint8 -converter.inference_output_type = tf.uint8 -``` +Additionally, to ensure compatibility with integer only devices (such as 8-bit +microcontrollers) and accelerators (such as the Coral Edge TPU), you can enforce +full integer quantization for all ops including the input and output, by using +the following steps: -The first line makes the converter throw an error if it encounters an operation -it cannot currently quantize. - -Note: `target_spec.supported_ops` was previously `target_ops` in the Python API. - - -### Float16 quantization of weights - -You can reduce the size of a floating point model by quantizing the weights to -float16, the IEEE standard for 16-bit floating point numbers. The advantages of -this quantization are as follows: - -- reduce model size by up to half (since all weights are now half the original - size) -- minimal loss in accuracy -- some delegates (e.g. the GPU delegate) can operate directly on float16 data, - which results in faster execution than float32 computations. - -This quantization may not be a good choice if you need maximum performance (a -quantization to fixed point math would be better in that case). To enable -float16 quantization of weights, specify "DEFAULT" optimization as above and -then specify that float16 is in supported types for the target_spec: - -``` +
 import tensorflow as tf
 converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
 converter.optimizations = [tf.lite.Optimize.DEFAULT]
-converter.target_spec.supported_types = [tf.lite.constants.FLOAT16]
+def representative_dataset_gen():
+  for _ in range(num_calibration_steps):
+    # Get sample input data as a numpy array in a method of your choosing.
+    yield [input]
+converter.representative_dataset = representative_dataset_gen
+converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
+converter.inference_input_type = tf.int8  # or tf.uint8
+converter.inference_output_type = tf.int8  # or tf.uint8
 tflite_quant_model = converter.convert()
-```
+
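With integer-only input and output, the caller is responsible for quantizing inputs and dequantizing outputs. A rough sketch, assuming a single int8 input and a single int8 output and reusing the quantization parameters reported by the interpreter (the random input is purely illustrative):

```python
import numpy as np
import tensorflow as tf

interpreter = tf.lite.Interpreter(model_content=tflite_quant_model)
interpreter.allocate_tensors()
input_details = interpreter.get_input_details()[0]
output_details = interpreter.get_output_details()[0]

# real_value = (int8_value - zero_point) * scale, so quantize by inverting it.
scale, zero_point = input_details['quantization']
float_sample = np.random.random_sample(input_details['shape']).astype(np.float32)
int8_sample = np.round(float_sample / scale + zero_point).astype(np.int8)

interpreter.set_tensor(input_details['index'], int8_sample)
interpreter.invoke()
int8_result = interpreter.get_tensor(output_details['index'])

# Dequantize the output back to float for downstream use.
out_scale, out_zero_point = output_details['quantization']
float_result = (int8_result.astype(np.float32) - out_zero_point) * out_scale
```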
-By default, a float16 quantized model will "dequantize" the weights values to -float32 when run on the CPU. The GPU delegate will not perform this -dequantization, since it can operate on float16 data. +Note: The converter will throw an error if it encounters an operation it cannot +currently quantize. + +### Float16 quantization + +You can reduce the size of a floating point model by quantizing the weights to +float16, the IEEE standard for 16-bit floating point numbers. To enable float16 +quantization of weights, use the following steps: + +
+import tensorflow as tf
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.optimizations = [tf.lite.Optimize.DEFAULT]
+converter.target_spec.supported_types = [tf.lite.constants.FLOAT16]
+tflite_quant_model = converter.convert()
+
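A quick way to observe the size reduction described in the advantages list below is to compare the float32 and float16 flatbuffers produced from the same SavedModel; a sketch, reusing the `saved_model_dir` placeholder from the snippets above:

```python
import tensorflow as tf

# Baseline float32 conversion.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
tflite_float_model = converter.convert()

# Float16 conversion, as in the snippet above.
converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_types = [tf.lite.constants.FLOAT16]
tflite_fp16_model = converter.convert()

print('float32 model: %d bytes' % len(tflite_float_model))
print('float16 model: %d bytes' % len(tflite_fp16_model))
```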
+ +The advantages of this quantization are as follows: + +* Reduce model size by up to half (since all weights are now half the original + size). +* Minimal loss in accuracy. +* Supports some delegates (e.g. the GPU delegate) can operate directly on + float16 data, which results in faster execution than float32 computations. + +The disadvantages of this quantization are as follows: + +* Not a good choice for maximum performance (a quantization to fixed point + math would be better in that case). +* By default, a float16 quantized model will "dequantize" the weights values + to float32 when run on the CPU. (Note that the GPU delegate will not perform + this dequantization, since it can operate on float16 data.) ### Model accuracy @@ -152,13 +161,18 @@ accuracy of the quantized model to verify that any degradation in accuracy is within acceptable limits. There is a tool to evaluate [TensorFlow Lite model accuracy](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/accuracy/ilsvrc/README.md){:.external}. -If the accuracy drop is too high, consider using -[quantization aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize){:.external}. +Alternatively, if the accuracy drop is too high, consider using +[quantization aware training](https://www.tensorflow.org/model_optimization/guide/quantization/training) +. However, doing so requires modifications during model training to add fake +quantization nodes, whereas the post-training quantization techniques on this +page use an existing pre-trained model. ### Representation for quantized tensors 8-bit quantization approximates floating point values using the following -formula. `real_value = (int8_value - zero_point) * scale`. +formula. + +$$real\_value = (int8\_value - zero\_point) \times scale$$ The representation has two main parts: From 1e07fa6448c01346054812ba0f0f71717f8156ff Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Tue, 12 May 2020 14:49:02 -0700 Subject: [PATCH 059/412] Fix Resize Nearest Neighbor versioning. align_corners support was added in version 3. 
PiperOrigin-RevId: 311206537 Change-Id: Ief12bdbbbedf5cf390315c5ee50a57e2000001ee --- tensorflow/lite/toco/tflite/operator.cc | 2 + .../lite/tools/versioning/op_version.cc | 12 +++- tensorflow/lite/tools/versioning/op_version.h | 1 + .../lite/tools/versioning/op_version_test.cc | 60 +++++++++++++++++++ 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc index 57b791a1a94..917fd24c952 100644 --- a/tensorflow/lite/toco/tflite/operator.cc +++ b/tensorflow/lite/toco/tflite/operator.cc @@ -1118,6 +1118,7 @@ class ResizeBilinear GetVersioningOpSig(builtin_op(), op_signature); op_sig.options.resize.half_pixel_centers = resize_bilinear_op.half_pixel_centers; + op_sig.options.resize.align_corners = resize_bilinear_op.align_corners; return ::tflite::GetBuiltinOperatorVersion(op_sig); } }; @@ -1147,6 +1148,7 @@ class ResizeNearestNeighbor ::tflite::OpSignature op_sig = GetVersioningOpSig(builtin_op(), op_signature); op_sig.options.resize.half_pixel_centers = resize_nn_op.half_pixel_centers; + op_sig.options.resize.align_corners = resize_nn_op.align_corners; return ::tflite::GetBuiltinOperatorVersion(op_sig); } }; diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 56aa9d5d0a9..9022afca629 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -363,13 +363,20 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { } return 1; case BuiltinOperator_RESIZE_BILINEAR: - case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: if (op_sig.options.resize.half_pixel_centers) { return 3; } else if (op_sig.input_types.at(0) == TensorType_INT8) { return 2; } return 1; + case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: + if (op_sig.options.resize.half_pixel_centers || + op_sig.options.resize.align_corners) { + return 3; + } else if (op_sig.input_types.at(0) == TensorType_INT8) { + return 2; + } + return 1; case BuiltinOperator_MAXIMUM: case BuiltinOperator_MINIMUM: @@ -612,6 +619,8 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, if (resize_bilinear_option) { op_sig.options.resize.half_pixel_centers = resize_bilinear_option->half_pixel_centers(); + op_sig.options.resize.align_corners = + resize_bilinear_option->align_corners(); } } break; case BuiltinOperator_RESIZE_NEAREST_NEIGHBOR: { @@ -620,6 +629,7 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, if (resize_nn_option) { op_sig.options.resize.half_pixel_centers = resize_nn_option->half_pixel_centers(); + op_sig.options.resize.align_corners = resize_nn_option->align_corners(); } } break; // TODO(b/150176627): Add tests for GetOpSignature. 
diff --git a/tensorflow/lite/tools/versioning/op_version.h b/tensorflow/lite/tools/versioning/op_version.h index fba6c943462..4b0fe8836e2 100644 --- a/tensorflow/lite/tools/versioning/op_version.h +++ b/tensorflow/lite/tools/versioning/op_version.h @@ -48,6 +48,7 @@ typedef struct { } lstm; struct { bool half_pixel_centers; + bool align_corners; } resize; struct { int32_t num_dims; diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index 7d9039ff848..f0d8259d764 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/lite/tools/versioning/op_version_test.cc @@ -594,4 +594,64 @@ TEST(OpVersionTEst, VersioningFillTest) { TensorType_INT32}}; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); } +TEST(OpVersionTest, VersioningResizeBilinearTest) { + // Default. + OpSignature fake_op_sig = { + .op = BuiltinOperator_RESIZE_BILINEAR, + .input_types = + std::vector{TensorType_FLOAT32, TensorType_INT32}, + .output_types = std::vector{TensorType_FLOAT32}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); + + // align_corners=true is still version 1. + fake_op_sig.options.resize.align_corners = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); + + // half_pixel_centers=true must be version 3. + fake_op_sig.options.resize.align_corners = false; + fake_op_sig.options.resize.half_pixel_centers = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + // int8 input is version 2. + fake_op_sig = { + .op = BuiltinOperator_RESIZE_BILINEAR, + .input_types = std::vector{TensorType_INT8, TensorType_INT32}, + .output_types = std::vector{TensorType_INT8}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + + fake_op_sig.options.resize.half_pixel_centers = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); +} +TEST(OpVersionTest, VersioningResizeNearestNeighborTest) { + // Default. + OpSignature fake_op_sig = { + .op = BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, + .input_types = + std::vector{TensorType_FLOAT32, TensorType_INT32}, + .output_types = std::vector{TensorType_FLOAT32}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 1); + + // align_corners=true is version 3. + fake_op_sig.options.resize.align_corners = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + // half_pixel_centers=true must be version 3. + fake_op_sig.options.resize.align_corners = false; + fake_op_sig.options.resize.half_pixel_centers = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); + + // int8 input is version 2. + fake_op_sig = { + .op = BuiltinOperator_RESIZE_NEAREST_NEIGHBOR, + .input_types = std::vector{TensorType_INT8, TensorType_INT32}, + .output_types = std::vector{TensorType_INT8}, + }; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 2); + + fake_op_sig.options.resize.align_corners = true; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 3); +} } // namespace tflite From c5caa29b5e6d10079020673a0dbd0035214df94d Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Tue, 12 May 2020 14:50:15 -0700 Subject: [PATCH 060/412] Make `core.Tensor` the base type for Tensor and replace the `register_dense_tensor_like` with direct subclassing. 
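For readers of this change, the user-visible pattern is roughly the following: the ad-hoc `register_dense_tensor_like_type` registry goes away, "tensor-like" behavior now comes from subclassing `core.Tensor`, and call sites reduce to a plain `isinstance` check. A minimal sketch based on the diff below (the class name is hypothetical; the import path is the one used in the patch):

```python
from tensorflow.python.types import core

# Before: types were registered and checked through the ops module.
#   ops.register_dense_tensor_like_type(MyTensorLike)
#   ops.is_dense_tensor_like(t)

# After: tensor-like types simply subclass core.Tensor ...
class MyTensorLike(core.Tensor):
  """Hypothetical tensor-like type, for illustration only."""

# ... and call sites use a plain isinstance check.
def is_tensor_like(t):
  return isinstance(t, core.Tensor)
```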
PiperOrigin-RevId: 311206817 Change-Id: Id8ae234516d5409d6b70612a99f9f0b3ed53dc7e --- tensorflow/BUILD | 7 +++ tensorflow/python/BUILD | 1 + tensorflow/python/distribute/values.py | 10 ++-- tensorflow/python/distribute/values_test.py | 11 ++-- tensorflow/python/framework/ops.py | 56 ++----------------- tensorflow/python/framework/ops_test.py | 50 ----------------- tensorflow/python/framework/tensor_util.py | 3 +- tensorflow/python/keras/engine/training_v1.py | 3 +- .../experimental/autocast_variable.py | 4 +- tensorflow/python/ops/array_ops.py | 15 ++--- .../python/ops/resource_variable_ops.py | 7 +-- tensorflow/python/ops/variable_scope.py | 5 +- tensorflow/python/ops/variables.py | 5 +- tensorflow/python/profiler/BUILD | 1 + tensorflow/python/types/BUILD | 5 +- .../api/golden/v1/tensorflow.-tensor.pbtxt | 1 + .../api/golden/v2/tensorflow.-tensor.pbtxt | 1 + 17 files changed, 52 insertions(+), 133 deletions(-) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index bf3af3c31b4..ab4316d5ed0 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -530,6 +530,13 @@ package_group(name = "ndarray_tensor_allow_list") # TODO(b/154762408) Remove this package group once it's no longer needed. package_group(name = "composite_tensor_whitelist") +# Packages that use private types symbols, until they are exported. +# TODO(b/154650521) Remove. +package_group( + name = "types_whitelist", + packages = ["//learning/deepmind/tensorflow/replicator/..."], +) + filegroup( name = "intel_binary_blob", data = if_mkl_ml( diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 11da45fbcbb..a49e4b74def 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -230,6 +230,7 @@ py_library( "//tensorflow/python/tools:module_util", "//tensorflow/python/tools/api/generator:create_python_api", "//tensorflow/python/tpu:tpu_noestimator", + "//tensorflow/python/types", "//third_party/py/numpy", ], ) diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 4fe3d287ccc..444915aa123 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training.saving import saveable_object from tensorflow.python.training.saving import saveable_object_util from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.types import core from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -422,7 +423,8 @@ class DistributedVarOp(object): return hash((self.name, self.graph, self.traceback, self.type)) -class DistributedVariable(DistributedDelegate, variables_lib.Variable): +class DistributedVariable(DistributedDelegate, variables_lib.Variable, + core.Tensor): """Holds a map from replica to variables.""" # TODO(josh11b): Support changing the set of variables if e.g. 
if new @@ -741,9 +743,6 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable): pass -ops.register_dense_tensor_like_type(DistributedVariable) - - def _validate_colocate_extended(v, extended): variable_strategy = v._distribute_strategy # pylint: disable=protected-access if variable_strategy.extended is not extended: @@ -1380,7 +1379,7 @@ def value_container(val): return val -class AggregatingVariable(variables_lib.Variable): +class AggregatingVariable(variables_lib.Variable, core.Tensor): """A wrapper around a variable that aggregates updates across replicas.""" def __init__(self, strategy, v, aggregation): @@ -1649,4 +1648,3 @@ def _tensor_conversion_aggregate(var, dtype=None, name=None, as_ref=False): ops.register_tensor_conversion_function(AggregatingVariable, _tensor_conversion_aggregate) -ops.register_dense_tensor_like_type(AggregatingVariable) diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index daa7e5563d3..67ed86b4047 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -56,6 +56,7 @@ from tensorflow.python.saved_model.model_utils import mode_keys from tensorflow.python.tpu import tpu_strategy_util from tensorflow.python.training import saver as saver_lib from tensorflow.python.training.tracking import util as trackable_utils +from tensorflow.python.types import core from tensorflow.python.util import nest @@ -623,10 +624,10 @@ class DistributedVariableTest(test.TestCase, parameterized.TestCase): v = variables_lib.Variable( 0., synchronization=synchronization, aggregation=aggregation) # In cross replica context. - self.assertTrue(ops.is_dense_tensor_like(v)) + self.assertIsInstance(v, core.Tensor) # In replica context. distribution.run( - lambda v: self.assertTrue(ops.is_dense_tensor_like(v)), args=(v,)) + lambda v: self.assertIsInstance(v, core.Tensor), args=(v,)) def testAssignReturnValueIsTensorLike(self, distribution, synchronization, aggregation): @@ -645,9 +646,9 @@ class DistributedVariableTest(test.TestCase, parameterized.TestCase): # values is not allowed when aggregation is SUM. See # `cross_device_ops.reduce_non_distributed_value`. delta = array_ops.identity(1.) - self.assertTrue(ops.is_dense_tensor_like(v.assign(delta))) - self.assertTrue(ops.is_dense_tensor_like(v.assign_sub(delta))) - self.assertTrue(ops.is_dense_tensor_like(v.assign_add(delta))) + self.assertIsInstance(v.assign(delta), core.Tensor) + self.assertIsInstance(v.assign_sub(delta), core.Tensor) + self.assertIsInstance(v.assign_add(delta), core.Tensor) # In cross replica context we return a PerReplica which is not Tensor like # yet. diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 9b8f7cf4fde..43652d51eae 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -62,6 +62,7 @@ from tensorflow.python.framework import versions from tensorflow.python.ops import control_flow_util from tensorflow.python.platform import app from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.types import core as core_tf_types from tensorflow.python.types import internal from tensorflow.python.util import compat from tensorflow.python.util import decorator_utils @@ -213,53 +214,11 @@ def _as_graph_element(obj): return None -_TENSOR_LIKE_TYPES = tuple() - - +# Deprecated - do not use. +# This API to avoid breaking estimator and tensorflow-mesh which depend on this +# internal API. 
The stub should be safe to use after TF 2.3 is released. def is_dense_tensor_like(t): - """EXPERIMENTAL: Returns true if `t` implements the tensor interface. - - See `register_dense_tensor_like_type()` for the current definition of a - "tensor-like type". - - Args: - t: An object. - - Returns: - True iff `t` is an instance of one of the registered "tensor-like" types. - """ - return isinstance(t, _TENSOR_LIKE_TYPES) - - -def register_dense_tensor_like_type(tensor_type): - """EXPERIMENTAL: Registers `tensor_type` as implementing the tensor interface. - - A "tensor-like type" can represent a single dense tensor, and implements - the `name`, `dtype` and `shape` properties. - - Args: - tensor_type: A type implementing the tensor interface. - - Raises: - TypeError: If `tensor_type` does not implement the tensor interface. - """ - if not (hasattr(tensor_type, "name") and - isinstance(tensor_type.name, property)): - raise TypeError("Type %s does not define a `name` property" % - tensor_type.__name__) - if not (hasattr(tensor_type, "dtype") and - isinstance(tensor_type.dtype, property)): - raise TypeError("Type %s does not define a `dtype` property" % - tensor_type.__name__) - if not (hasattr(tensor_type, "shape") and - isinstance(tensor_type.shape, property)): - raise TypeError("Type %s does not define a `shape` property" % - tensor_type.__name__) - # We expect this list to be small, so choose quadratic complexity - # for registration, so that we have a tuple that can be used for - # more efficient `isinstance` checks later. - global _TENSOR_LIKE_TYPES - _TENSOR_LIKE_TYPES = tuple(list(_TENSOR_LIKE_TYPES) + [tensor_type]) + return isinstance(t, core_tf_types.Tensor) def uid(): @@ -304,7 +263,7 @@ def disable_tensor_equality(): # TODO(mdan): This object should subclass Symbol, not just Tensor. @tf_export("Tensor") -class Tensor(internal.NativeObject): +class Tensor(internal.NativeObject, core_tf_types.Tensor): """A tensor is a multidimensional array of elements represented by a `tf.Tensor` object. All elements are of a single known data type. 
@@ -1305,9 +1264,6 @@ class _EagerTensorBase(Tensor): EagerTensor = pywrap_tfe.TFE_Py_InitEagerTensor(_EagerTensorBase) -register_dense_tensor_like_type(Tensor) - - @tf_export(v1=["convert_to_tensor"]) def convert_to_tensor_v1(value, dtype=None, diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 20f58a00cfe..322df8ffac8 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -3268,56 +3268,6 @@ class DeprecatedTest(test_util.TensorFlowTestCase): test_ops.old() -class DenseTensorLikeTypeTest(test_util.TensorFlowTestCase): - - @test_util.disable_tfrt("Graph is not supported yet.") - def testSuccess(self): - op = ops.Operation( - ops._NodeDef("FloatOutput", "myop"), ops.Graph(), [], [dtypes.float32]) - t = op.outputs[0] - self.assertTrue(ops.is_dense_tensor_like(t)) - - v = variables.Variable([17]) - self.assertTrue(ops.is_dense_tensor_like(v)) - - class BadClassNoName(object): - pass - - class BadClassBadName(object): - - def name(self): - pass - - class BadClassNoDtype(object): - - @property - def name(self): - pass - - class BadClassBadDtype(object): - - @property - def name(self): - pass - - def dtype(self): - pass - - def testBadClass(self): - with self.assertRaisesRegexp(TypeError, "`name`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassNoName) - with self.assertRaisesRegexp(TypeError, "`name`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassBadName) - with self.assertRaisesRegexp(TypeError, "`dtype`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassNoDtype) - with self.assertRaisesRegexp(TypeError, "`dtype`"): - ops.register_dense_tensor_like_type( - DenseTensorLikeTypeTest.BadClassBadDtype) - - class NameScopeTest(test_util.TensorFlowTestCase): def testStripAndPrependScope(self): diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index 50388595c3d..968b635250a 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -26,6 +26,7 @@ from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.types import core from tensorflow.python.types import internal from tensorflow.python.util import compat from tensorflow.python.util import nest @@ -1009,7 +1010,7 @@ def is_tensor(x): # pylint: disable=invalid-name `True` if `x` is a tensor or "tensor-like", `False` if not. 
""" return (isinstance(x, internal.NativeObject) or - ops.is_dense_tensor_like(x) or + isinstance(x, core.Tensor) or getattr(x, "is_tensor_like", False)) diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index 0a40ce3899b..16188af833a 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -62,6 +62,7 @@ from tensorflow.python.ops.losses import util as tf_losses_utils from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.tracking import base as trackable from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils +from tensorflow.python.types import core from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect @@ -3143,7 +3144,7 @@ def _convert_scipy_sparse_tensor(value, expected_input): The possibly-converted 'value'. """ if issparse is not None and issparse(value): - if ops.is_dense_tensor_like(expected_input): + if isinstance(expected_input, core.Tensor): if ops.executing_eagerly_outside_functions(): # In TF2 we do not silently densify sparse matrices. raise ValueError('A SciPy sparse matrix was passed to a model ' diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index c43ca21ea06..29e5a68c854 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -23,9 +23,10 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables +from tensorflow.python.types import core -class AutoCastVariable(variables.Variable): +class AutoCastVariable(variables.Variable, core.Tensor): """Variable that will cast itself to a different dtype in applicable contexts. This class wraps a floating-point `tf.Variable`. It emulates the variable @@ -417,7 +418,6 @@ class AutoCastVariable(variables.Variable): ops.register_tensor_conversion_function(AutoCastVariable, AutoCastVariable._dense_var_to_tensor) # pylint:disable=protected-access -ops.register_dense_tensor_like_type(AutoCastVariable) def create_autocast_variable(variable): diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 33aac84d77f..1cb6fdbd726 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import gen_math_ops # pylint: disable=wildcard-import from tensorflow.python.ops.gen_array_ops import * from tensorflow.python.ops.gen_array_ops import reverse_v2 as reverse # pylint: disable=unused-import +from tensorflow.python.types import core from tensorflow.python.util import deprecation from tensorflow.python.util import dispatch from tensorflow.python.util import nest @@ -1381,13 +1382,13 @@ def _autopacking_helper(list_or_tuple, dtype, name): if context.executing_eagerly(): # NOTE: Fast path when all the items are tensors, this doesn't do any type # checking. 
- if all(ops.is_dense_tensor_like(elem) for elem in list_or_tuple): + if all(isinstance(elem, core.Tensor) for elem in list_or_tuple): return gen_array_ops.pack(list_or_tuple, name=name) must_pack = False converted_elems = [] with ops.name_scope(name) as scope: for i, elem in enumerate(list_or_tuple): - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): if dtype is not None and elem.dtype.base_dtype != dtype: raise TypeError("Cannot convert a list containing a tensor of dtype " "%s to %s (Tensor is: %r)" % @@ -1396,7 +1397,7 @@ def _autopacking_helper(list_or_tuple, dtype, name): must_pack = True elif isinstance(elem, (list, tuple)): converted_elem = _autopacking_helper(elem, dtype, str(i)) - if ops.is_dense_tensor_like(converted_elem): + if isinstance(converted_elem, core.Tensor): must_pack = True converted_elems.append(converted_elem) else: @@ -1404,7 +1405,7 @@ def _autopacking_helper(list_or_tuple, dtype, name): if must_pack: elems_as_tensors = [] for i, elem in enumerate(converted_elems): - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): elems_as_tensors.append(elem) else: # NOTE(mrry): This is inefficient, but it enables us to @@ -1429,7 +1430,7 @@ def _get_dtype_from_nested_lists(list_or_tuple): such object exists. """ for elem in list_or_tuple: - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): return elem.dtype.base_dtype elif isinstance(elem, (list, tuple)): maybe_dtype = _get_dtype_from_nested_lists(elem) @@ -1441,7 +1442,7 @@ def _get_dtype_from_nested_lists(list_or_tuple): def _cast_nested_seqs_to_dtype(dtype): def _maybe_cast(elem): - if ops.is_dense_tensor_like(elem): + if isinstance(elem, core.Tensor): if dtype != elem.dtype.base_dtype: elem = gen_math_ops.cast(elem, dtype) return elem @@ -1455,7 +1456,7 @@ _NON_AUTOPACKABLE_TYPES.add(np.ndarray) def _should_not_autopack(v): # The condition we really want is - # ops.is_dense_tensor_like(...) + # any(isinstance(elem, core.Tensor)) # but it is >5x slower due to abc.ABCMeta.__instancecheck__. # pylint: disable=unidiomatic-typecheck # TODO(slebedev): add nest.all? diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index f99f886f210..d8a7765a208 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -49,6 +49,7 @@ from tensorflow.python.ops import variables from tensorflow.python.ops.gen_resource_variable_ops import * # pylint: enable=wildcard-import from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.types import core from tensorflow.python.util import compat from tensorflow.python.util.deprecation import deprecated @@ -330,7 +331,7 @@ def variable_accessed(variable): tape.variable_accessed(variable) -class BaseResourceVariable(variables.VariableV1): +class BaseResourceVariable(variables.VariableV1, core.Tensor): """A python variable from an existing handle.""" # TODO(wangpeng): Deprecate `constraint` when callers no long pass it in. @@ -1830,7 +1831,6 @@ def _dense_var_to_tensor(var, dtype=None, name=None, as_ref=False): # allowing instances of the class to be used as tensors. 
ops.register_tensor_conversion_function(BaseResourceVariable, _dense_var_to_tensor) -ops.register_dense_tensor_like_type(BaseResourceVariable) class _UnreadVariable(BaseResourceVariable): @@ -1955,9 +1955,6 @@ class _UnreadVariable(BaseResourceVariable): return self._parent_op -ops.register_dense_tensor_like_type(_UnreadVariable) - - @ops.RegisterGradient("ReadVariableOp") def _ReadGrad(_, grad): """Gradient for read op.""" diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py index d65cd235ca8..81c3f9a2f70 100644 --- a/tensorflow/python/ops/variable_scope.py +++ b/tensorflow/python/ops/variable_scope.py @@ -42,6 +42,7 @@ from tensorflow.python.ops import init_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.types import core from tensorflow.python.util import deprecation from tensorflow.python.util import function_utils from tensorflow.python.util import tf_contextlib @@ -1000,7 +1001,7 @@ class _VariableStore(object): return initializer, initializing_from_value -class _LazyEvalTensor(object): +class _LazyEvalTensor(core.Tensor): """A Tensor-like object that only evaluates its thunk when used.""" def __init__(self, thunk): @@ -1069,8 +1070,6 @@ session.register_session_run_conversion_functions( lambda fetch: ([fetch._master_tensor], lambda fetched_vals: fetched_vals[0]) # pylint: disable=protected-access ) -ops.register_dense_tensor_like_type(_LazyEvalTensor) - # To stop regularization, use this regularizer @tf_export(v1=["no_regularizer"]) diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py index 1080778e3d3..d3df0659b5a 100644 --- a/tensorflow/python/ops/variables.py +++ b/tensorflow/python/ops/variables.py @@ -47,6 +47,7 @@ from tensorflow.python.util import tf_should_use from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.tf_export import tf_export +from tensorflow.python.types import core def default_variable_creator(_, **kwds): @@ -264,6 +265,7 @@ class VariableMetaclass(type): @tf_export("Variable", v1=[]) +# TODO(mdan): This should subclass core.Tensor, and not all its subclasses? class Variable(six.with_metaclass(VariableMetaclass, trackable.Trackable)): """See the [variable guide](https://tensorflow.org/guide/variable). @@ -1551,7 +1553,7 @@ class VariableV1(Variable): # TODO(apassos): do not repeat all comments here -class RefVariable(VariableV1): +class RefVariable(VariableV1, core.Tensor): """Ref-based implementation of variables.""" def __init__( @@ -3032,7 +3034,6 @@ class PartitionedVariable(object): # allowing instances of the class to be used as tensors. 
ops.register_tensor_conversion_function(RefVariable, RefVariable._TensorConversionFunction) # pylint: disable=protected-access -ops.register_dense_tensor_like_type(RefVariable) @tf_export(v1=["global_variables"]) diff --git a/tensorflow/python/profiler/BUILD b/tensorflow/python/profiler/BUILD index e5ca60843e3..b6565f594c9 100644 --- a/tensorflow/python/profiler/BUILD +++ b/tensorflow/python/profiler/BUILD @@ -226,6 +226,7 @@ py_library( deps = [ "//tensorflow/python:util", "//tensorflow/python/profiler/internal:_pywrap_traceme", + "//tensorflow/python/types", "@six_archive//:six", ], ) diff --git a/tensorflow/python/types/BUILD b/tensorflow/python/types/BUILD index f35ca7fb803..e93bf5c10b3 100644 --- a/tensorflow/python/types/BUILD +++ b/tensorflow/python/types/BUILD @@ -27,6 +27,9 @@ py_strict_library( "internal.py", ], srcs_version = "PY2AND3", - visibility = ["//tensorflow:__subpackages__"], + visibility = [ + "//tensorflow:__subpackages__", + "//tensorflow:types_whitelist", + ], deps = [], ) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt index 4a30fae1da9..9315973e51d 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor.pbtxt @@ -2,6 +2,7 @@ path: "tensorflow.Tensor" tf_class { is_instance: "" is_instance: "" + is_instance: "" is_instance: "" member { name: "OVERLOADABLE_OPERATORS" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt index 4a30fae1da9..9315973e51d 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor.pbtxt @@ -2,6 +2,7 @@ path: "tensorflow.Tensor" tf_class { is_instance: "" is_instance: "" + is_instance: "" is_instance: "" member { name: "OVERLOADABLE_OPERATORS" From 65321b89c7898fb5184a64d1f42066fa14c7175f Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 12 May 2020 14:53:22 -0700 Subject: [PATCH 061/412] Disable collective ops xla test on gpu PiperOrigin-RevId: 311207584 Change-Id: Ibb51f5ee646edbc39d65af8a47495b21751604be --- tensorflow/compiler/xla/tests/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index c8a242c156a..1ad1f8363cf 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -1909,7 +1909,7 @@ xla_test( # This test is tagged "manual" because it requires multiple GPUs, and # Forge only supports single-GPU tests. Guitar skips "manual" tests # unless they're also tagged "guitar". - "guitar", + # "guitar", # Re-enable after b/156405690 is fixed. "manual", "multi_gpu", "no_oss", From f66f384729b2a2f70fd01902f49b0b7a95be9f26 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Tue, 12 May 2020 15:01:00 -0700 Subject: [PATCH 062/412] Make CHLO->HLO patterns extend OpRewritePattern vs OpConversionPattern. * In the absence of type conversion, this is more generally compatible (ie. with the greedy rewriter). * Consistent with the rest of the legalize_tf patterns. 
PiperOrigin-RevId: 311209137 Change-Id: I3a409dbc307c141753c73ae7731276c61a2728d0 --- .../xla/transforms/chlo_legalize_to_hlo.cc | 41 +++++++++---------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc index a20511a95fc..0c9585a817f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/chlo_legalize_to_hlo.cc @@ -33,24 +33,23 @@ namespace { // Converts binary ops that statically are determined to not broadcast directly // to the corresponding xla_hlo non-broadcasting op. template -struct ConvertTrivialNonBroadcastBinaryOp - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite( - ChloOpTy op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { +struct ConvertTrivialNonBroadcastBinaryOp : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { // Only rewrite for statically determinable non-broadcasting cases. - auto lhs = operands[0].getType().dyn_cast(); - auto rhs = operands[1].getType().dyn_cast(); - if (!lhs || !rhs) return failure(); + auto lhs_type = op.lhs().getType().template dyn_cast(); + auto rhs_type = op.rhs().getType().template dyn_cast(); + if (!lhs_type || !rhs_type) return failure(); // Requires rank broadcast. - if (lhs.getRank() != rhs.getRank()) return failure(); + if (lhs_type.getRank() != rhs_type.getRank()) return failure(); // Any dynamic dimension may require broadcasting and requires more // analysis. - if (!lhs.hasStaticShape() || !rhs.hasStaticShape()) return failure(); + if (!lhs_type.hasStaticShape() || !rhs_type.hasStaticShape()) + return failure(); - for (auto extents : llvm::zip(lhs.getShape(), rhs.getShape())) { + for (auto extents : llvm::zip(lhs_type.getShape(), rhs_type.getShape())) { auto lhs_extent = std::get<0>(extents); auto rhs_extent = std::get<1>(extents); if (lhs_extent != rhs_extent) { @@ -58,9 +57,8 @@ struct ConvertTrivialNonBroadcastBinaryOp } } - rewriter.replaceOp( - op, {Adaptor::CreateOp(op, op.getResult().getType(), operands[0], - operands[1], rewriter)}); + rewriter.replaceOp(op, {Adaptor::CreateOp(op, op.getResult().getType(), + op.lhs(), op.rhs(), rewriter)}); return success(); } }; @@ -83,14 +81,13 @@ struct ConvertTrivialNonBroadcastBinaryOp // Whether that is of any practical benefit remains to be seen. template struct ConvertRankedDynamicBroadcastBinaryOp - : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - LogicalResult matchAndRewrite( - ChloOpTy op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(ChloOpTy op, + PatternRewriter &rewriter) const override { // Only support ranked operands. - Value lhs = operands[0]; - Value rhs = operands[1]; + Value lhs = op.lhs(); + Value rhs = op.rhs(); auto lhs_type = lhs.getType().dyn_cast(); auto rhs_type = rhs.getType().dyn_cast(); auto result_type = From f38355dab31bb466e9fdc900089dcd4abba536d6 Mon Sep 17 00:00:00 2001 From: Prakalp Srivastava Date: Tue, 12 May 2020 15:06:44 -0700 Subject: [PATCH 063/412] Add TF_AllTypesMatch trait in TensorFlow dialect. 
TF_AllTypesMatch trait takes a list of operands/results/attributes and verifies that they have cast compatible types i.e., a single runtime type for all the values. It handles Resource subtypes as well and assumes all variant subtypes are cast compatible. AllTypesMatch trait on TF dialect ops can be replaced with this trait which handles unranked/dynamic shapes and TF subtypes. PiperOrigin-RevId: 311210249 Change-Id: I0d0dec247ff256c0b23aa9b5750912eaefc064f5 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 6 +- .../compiler/mlir/tensorflow/ir/tf_op_base.td | 10 + .../compiler/mlir/tensorflow/ir/tf_ops.cc | 55 +----- .../compiler/mlir/tensorflow/ir/tf_types.cc | 174 ++++++++++++++---- .../compiler/mlir/tensorflow/ir/tf_types.h | 6 + .../mlir/tensorflow/tests/tf-ops.mlir | 27 ++- 6 files changed, 181 insertions(+), 97 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 9a29fa4f8a9..bddf064f5c6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1765,7 +1765,7 @@ of corresponding 3-element vectors is cross-multiplied independently. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [AllTypesMatch<["input", "output"]>, NoSideEffect]> { +def TF_CrossReplicaSumOp : TF_Op<"CrossReplicaSum", [NoSideEffect, TF_AllTypesMatch<["input", "output"]>]> { let summary = "An Op to sum inputs across replicated TPU instances."; let description = [{ @@ -1789,7 +1789,7 @@ and `B, D, F, H` as group 1. Thus we get the outputs: TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_CumsumOp : TF_Op<"Cumsum", [AllTypesMatch<["x", "out"]>, NoSideEffect]> { +def TF_CumsumOp : TF_Op<"Cumsum", [NoSideEffect, TF_AllTypesMatch<["x", "out"]>]> { let summary = "Compute the cumulative sum of the tensor `x` along `axis`."; let description = [{ @@ -4350,7 +4350,7 @@ cublas. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } -def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [AllTypesMatch<["input", "band"]>, NoSideEffect]> { +def TF_MatrixBandPartOp : TF_Op<"MatrixBandPart", [NoSideEffect, TF_AllTypesMatch<["input", "band"]>]> { let summary = [{ Copy a tensor setting everything outside a central band in each innermost matrix to zero. 
}]; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index cb17341cefd..cd20cc79c17 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -70,6 +70,16 @@ class TF_OpIsBroadcastableToRes : And<[ "$_op.getOperand(" # opId # ").getType(), " "$_op.getResult(" # resId # ").getType())">]>; + +class TF_AllTypesMatchPred values> : + CPred<"TF::AreCastCompatible(llvm::makeArrayRef({"# StrJoin.result #"}))">; + +class TF_AllTypesMatch names> : + PredOpTrait< + "all of {" # StrJoin.result # "} have dynamically equal types ", + TF_AllTypesMatchPred< + !foreach(n, names, !subst("$_self", "$" # n, "$_self.getType()"))>>; + //===----------------------------------------------------------------------===// // TensorFlow op definitions //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 85baff5e0d7..82ddc80875a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -110,47 +110,6 @@ static inline bool HasRankAtMost(Value value, int64_t rank) { return !type || type.getRank() <= rank; } -// Returns true if the given pair of TensorFlow types can be cast to one -// another. In other words, a single run-time value is legal for both the types. -// For example, tensor<*xf32> and tensor<3xf32> are cast compatible. -static bool AreCastCompatible(Type a, Type b) { - if (TensorCastOp::areCastCompatible(a, b)) return true; - - // Resource types may optionally contain subtypes information that does not - // match. Check subtypes compatibility when possible, otherwise treat them as - // compatible. - auto a_or_element_type = getElementTypeOrSelf(a); - auto b_or_element_type = getElementTypeOrSelf(b); - - auto a_kind = a_or_element_type.getKind(); - auto b_kind = b_or_element_type.getKind(); - - if (a_kind == TensorFlowTypes::RESOURCE && - b_kind == TensorFlowTypes::RESOURCE) { - auto a_resource_type = a_or_element_type.dyn_cast(); - auto b_resource_type = b_or_element_type.dyn_cast(); - bool a_has_subtype = !a_resource_type.getSubtypes().empty(); - bool b_has_subtype = !b_resource_type.getSubtypes().empty(); - - if (!a_has_subtype || !b_has_subtype) return true; - - assert(a_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - assert(b_resource_type.getSubtypes().size() <= 1 && - "Resource type must have at most one subtype"); - - return TensorCastOp::areCastCompatible( - a_resource_type.getSubtypes().front(), - b_resource_type.getSubtypes().front()); - } - - // Variant types may optionally contain subtypes information that need not - // match. It is also not possible to compare subtypes for compatibility as - // their interpretation depends on the ops operating on them. So, accept all - // pairs of variant types. 
- return a_kind == TensorFlowTypes::VARIANT && - b_kind == TensorFlowTypes::VARIANT; -} static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; @@ -1413,7 +1372,7 @@ static LogicalResult Verify(DynamicStitchOp op) { auto expected_out_ty = RankedTensorType::get(expected_shape, out_ty.getElementType()); - if (!AreCastCompatible(out_ty, expected_out_ty)) { + if (!AreCastCompatible({out_ty, expected_out_ty})) { return op.emitOpError() << "has invalid output type; should be " "compatible with inferred type " << expected_out_ty; @@ -1814,14 +1773,14 @@ static LogicalResult Verify(IfOp op) { for (unsigned i = 0; i < expectedNumInputs; ++i) { auto operandType = op.getOperand(i + 1).getType().cast(); auto thenInputType = thenFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, thenInputType)) + if (!AreCastCompatible({operandType, thenInputType})) return op.emitError( llvm::formatv("then branch input type {0} is incompatible with " "operand type {1} at index {2}", thenInputType, operandType, i)); auto elseInputType = elseFuncType.getInput(i).cast(); - if (!AreCastCompatible(operandType, elseInputType)) + if (!AreCastCompatible({operandType, elseInputType})) return op.emitError( llvm::formatv("else branch input type {0} is incompatible with " "operand type {1} at index {2}", @@ -1829,7 +1788,7 @@ static LogicalResult Verify(IfOp op) { // If branches have incompatible input types that means that no tensor can // serve as input to both the functions. Hence, the op is invalid. - if (!AreCastCompatible(thenInputType, elseInputType)) + if (!AreCastCompatible({thenInputType, elseInputType})) return op.emitError(llvm::formatv( "branches inputs have incompatible types {0} and {1} at index {2}", thenInputType, elseInputType, i)); @@ -1845,14 +1804,14 @@ static LogicalResult Verify(IfOp op) { for (unsigned i = 0; i < expectedNumResults; ++i) { auto resultType = op.getResult(i).getType().cast(); auto thenResultType = thenFuncType.getResult(i).cast(); - if (!AreCastCompatible(thenResultType, resultType)) + if (!AreCastCompatible({thenResultType, resultType})) return op.emitError( llvm::formatv("then branch result type {0} is incompatible with op " "result type {1} at index {2}", thenResultType, resultType, i)); auto elseResultType = elseFuncType.getResult(i).cast(); - if (!AreCastCompatible(elseResultType, resultType)) + if (!AreCastCompatible({elseResultType, resultType})) return op.emitError( llvm::formatv("else branch result type {0} is incompatible with op " "result type {1} at index {2}", @@ -3789,7 +3748,7 @@ static LogicalResult Verify(WhileOp op) { auto aType = a.second[idx]; auto bType = b.second[idx]; - if (!AreCastCompatible(aType, bType)) + if (!AreCastCompatible({aType, bType})) return op.emitError(llvm::formatv( "{0} type {1} is incompatible with {2} type {3} at index {4}", a.first, aType, b.first, bType, idx)); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc index 6c3cd7fac92..d312e5e409b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.cc @@ -28,6 +28,134 @@ llvm::Optional> GetShape(mlir::Value value) { if (shaped_type.hasRank()) return shaped_type.getShape(); return llvm::None; } + +// Merges cast compatible shapes and returns a more refined shape. The two +// shapes are cast compatible if they have the same rank and at each dimension, +// either both have same size or one of them is dynamic. 
Returns false if the +// given shapes are not cast compatible. The refined shape is same or more +// precise than the two input shapes. +bool GetCastCompatibleShape(llvm::ArrayRef a_shape, + llvm::ArrayRef b_shape, + llvm::SmallVectorImpl* refined_shape) { + if (a_shape.size() != b_shape.size()) return false; + int64_t rank = a_shape.size(); + refined_shape->reserve(rank); + for (auto dims : llvm::zip(a_shape, b_shape)) { + int64_t dim1 = std::get<0>(dims); + int64_t dim2 = std::get<1>(dims); + + if (mlir::ShapedType::isDynamic(dim1)) { + refined_shape->push_back(dim2); + continue; + } + if (mlir::ShapedType::isDynamic(dim2)) { + refined_shape->push_back(dim1); + continue; + } + if (dim1 == dim2) { + refined_shape->push_back(dim1); + continue; + } + return false; + } + return true; +} + +// Given two types `a` and `b`, returns a refined type which is cast compatible +// with both `a` and `b` and is equal to or more precise than both of them. It +// returns empty Type if the input types are not cast compatible. +// +// The two types are considered cast compatible if they have dynamically equal +// shapes and element type. For element types that do not have subtypes, they +// must be equal. However for TensorFlow types such as Resource and Variant, +// that also have subtypes, we recursively check for subtype compatibilty for +// Resource types and assume all variant types are cast compatible. If either +// one of `a` or `b` have empty subtypes, they are considered cast compatible. +// +// The returned type is same or more precise than the input types. For example, +// if `a` and `b` are cast compatible types tensor<2x?x?xf32> and +// tensor respectively, the returned type is tensor<2x4x?xf32>. +// +// Provides option to ignore ref types on 'a'. This is useful for TF ops that +// might allow operands to either be same as result type or be a ref type +// corresponding to it. +mlir::Type GetCastCompatibleType(mlir::Type a, mlir::Type b, + bool may_ignore_ref_type_a) { + // Fast path if everything is equal. + if (a == b) return b; + + auto a_tt = a.dyn_cast(); + auto b_tt = b.dyn_cast(); + + // If only one of a or b is a tensor type, they are incompatible. + if (static_cast(a_tt) ^ static_cast(b_tt)) return nullptr; + + // For non-tensor types, we do not need to worry about shape and can return + // early. + if (!a_tt && !b_tt) { + // Remove ref types. + if (may_ignore_ref_type_a) { + if (auto ref_type = a.dyn_cast()) { + a = ref_type.RemoveRef(); + if (a == b) return a; + } + } + if (a.getKind() != b.getKind()) return nullptr; + + // If either is not a type that contain subtypes then the types are not cast + // compatible. + auto a_wst = a.dyn_cast(); + auto b_wst = b.dyn_cast(); + if (!a_wst || !b_wst) return nullptr; + + // For Variant types we are more permissive right now and accept all pairs + // of Variant types. If we are more constrainted and check compatibility of + // subtypes, we might reject valid graphs. + // TODO(prakalps): Variant doesn't have a subtype, we assign it + // one, so we should only assign it one when we know the subtype. Then we + // can be more constrained and check subtypes for cast compatibility as + // well. + if (a.isa()) return a; + + // For Resource types, we recursively check the subtypes for cast + // compatibility, if possible. Otherwise treat them as compatible. 
+ auto a_wst_st = a_wst.GetSubtypes(); + auto b_wst_st = b_wst.GetSubtypes(); + if (a_wst_st.empty() || b_wst_st.empty()) return a; + if (a_wst_st.size() != b_wst_st.size()) return nullptr; + llvm::SmallVector refined_subtypes; + for (auto subtypes : llvm::zip(a_wst_st, b_wst_st)) { + mlir::Type refined_st = + GetCastCompatibleType(std::get<0>(subtypes), std::get<1>(subtypes), + /*may_ignore_ref_type_a=*/false); + if (!refined_st) return nullptr; + refined_subtypes.push_back(refined_st.cast()); + } + + return mlir::TF::ResourceType::get(refined_subtypes, a.getContext()); + } + + // For tensor types, check compatibility of both element type and shape. + mlir::Type refined_element_ty = GetCastCompatibleType( + a_tt.getElementType(), b_tt.getElementType(), may_ignore_ref_type_a); + if (!refined_element_ty) return nullptr; + + if (!a_tt.hasRank() && !b_tt.hasRank()) { + return mlir::UnrankedTensorType::get(refined_element_ty); + } + if (!a_tt.hasRank()) { + return mlir::RankedTensorType::get(b_tt.getShape(), refined_element_ty); + } + if (!b_tt.hasRank()) { + return mlir::RankedTensorType::get(a_tt.getShape(), refined_element_ty); + } + + llvm::SmallVector refined_shape; + if (!GetCastCompatibleShape(a_tt.getShape(), b_tt.getShape(), &refined_shape)) + return nullptr; + + return mlir::RankedTensorType::get(refined_shape, refined_element_ty); +} } // namespace namespace mlir { @@ -224,44 +352,16 @@ bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs) { bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs) { - // Fast path if everything is equal. - if (lhs == rhs) return true; + return GetCastCompatibleType(lhs, rhs, may_ignore_ref_type_lhs) != nullptr; +} - // In TF all values are tensors. - auto lhs_tt = lhs.cast(); - auto rhs_tt = rhs.cast(); - - // Verify matching element types. These should be identical dynamically, - // so this allows for types not yet fully refined. - auto lhs_et = lhs_tt.getElementType(); - auto rhs_et = rhs_tt.getElementType(); - if (lhs_et == rhs_et) return true; - - // Remove ref types. - if (may_ignore_ref_type_lhs) { - if (auto ref_type = lhs_et.dyn_cast()) { - lhs_et = ref_type.RemoveRef(); - if (lhs_et == rhs_et) return true; - } - } - - if (lhs_et.getKind() != rhs_et.getKind()) return false; - - // If either is not type that contain subtypes then the element types don't - // match. - auto lhs_wst = lhs_et.dyn_cast(); - auto rhs_wst = rhs_et.dyn_cast(); - if (!lhs_wst || !rhs_wst) return false; - - // Consider the subtype recursively. 
- auto lhs_wst_st = lhs_wst.GetSubtypes(); - auto rhs_wst_st = rhs_wst.GetSubtypes(); - if (lhs_wst_st.empty() || rhs_wst_st.empty()) return true; - if (lhs_wst_st.size() != rhs_wst_st.size()) return false; - for (auto subtypes : llvm::zip(lhs_wst_st, rhs_wst_st)) { - if (!HasCompatibleElementTypes(std::get<0>(subtypes), - std::get<1>(subtypes))) - return false; +bool AreCastCompatible(ArrayRef types) { + Type common = types.front(); + for (auto type : types.drop_front()) { + Type refined_type = + GetCastCompatibleType(common, type, /*may_ignore_ref_type_a=*/false); + if (!refined_type) return false; + common = refined_type; } return true; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h index d1e6a74a0c5..4c99aae4706 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_types.h @@ -313,6 +313,12 @@ bool BroadcastCompatible(ArrayRef lhs, ArrayRef rhs); bool HasCompatibleElementTypes(Type lhs, Type rhs, bool may_ignore_ref_type_lhs = false); +// Returns true if all TensorFlow types can be cast to one +// another. In other words, a single run-time value is legal for both the types. +// For example, tensor<*xf32>, tensor and tensor<3xf32> are cast +// compatible. +bool AreCastCompatible(ArrayRef types); + } // end namespace TF } // end namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 118ce2e8645..ffa287e0e53 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -881,20 +881,29 @@ func @testValidMatrixBandPartOpUnranked(%arg0: tensor<*xbf16>, %arg1: tensor, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { - // expected-error @+1 {{op failed to verify that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> - return %0 : tensor<64x64xbf16> +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpUnrankedBand +func @testValidMatrixBandPartOpUnrankedBand(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<*xbf16> + return %0 : tensor<*xbf16> +} + +// ----- + +// Test valid tf.MatrixBandPart +// CHECK-LABEL: func @testValidMatrixBandPartOpCompatibleDynamicShapes +func @testValidMatrixBandPartOpCompatibleDynamicShapes(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor } // ----- // Test invalid tf.MatrixBandPart -func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<*xbf16> { - // expected-error @+1 {{op failed to verify that all of {input, band} have same type}} - %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<*xbf16> - return %0 : tensor<*xbf16> +func @testInvalidMatrixBandPartOp(%arg0: tensor<64x64x64xbf16>, %arg1: tensor, %arg2: tensor) -> tensor<64x64xbf16> { + // expected-error @+1 {{op failed to verify that all of {input, band} have dynamically equal types}} + %0 = "tf.MatrixBandPart"(%arg0, %arg1, %arg2) : (tensor<64x64x64xbf16>, tensor, tensor) -> tensor<64x64xbf16> + return %0 : tensor<64x64xbf16> } // ----- From 
9a43ab39f2db65d5526773c9c6b45f2087e4c1c7 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Tue, 12 May 2020 15:41:09 -0700 Subject: [PATCH 064/412] Modify op version in optimize only if convertor version < quantized version. PiperOrigin-RevId: 311216743 Change-Id: Iaac04750d0d302e9bba11b223c2885d6a36d74b3 --- tensorflow/lite/tools/optimize/model_utils.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/tools/optimize/model_utils.cc b/tensorflow/lite/tools/optimize/model_utils.cc index 26dcff222bd..ae868cf21b8 100644 --- a/tensorflow/lite/tools/optimize/model_utils.cc +++ b/tensorflow/lite/tools/optimize/model_utils.cc @@ -134,8 +134,10 @@ void SetOperatorCodeVersion(ModelT* model) { OperatorCodeT* op_code = model->operator_codes[op->opcode_index].get(); operator_property::OperatorProperty property = operator_property::GetOperatorProperty(model, subgraph_idx, op_idx); - if (property.quantizable) { - // Only update the versions of quantizable operations. + if (property.quantizable && op_code->version < property.version) { + // Only update the versions of quantizable operations if the original + // version is lesser than minimum quantized one mentioned by + // OperatorProperty. op_code->version = property.version; } } From ce11d03f84bb182c0eb4bdda0d838c58c83a9e24 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Tue, 12 May 2020 15:51:38 -0700 Subject: [PATCH 065/412] [XLA/GPU] Make Thunk::Initialize() happen at compile-time, not run-time. This simplifies GpuExecutable for MLIR transition. PiperOrigin-RevId: 311218613 Change-Id: I42aaca015689b19c8a6343f1cac50451e6d0cf84 --- tensorflow/compiler/xla/service/gpu/BUILD | 12 +++++------ .../xla/service/gpu/amdgpu_compiler.cc | 10 +++++---- .../xla/service/gpu/amdgpu_compiler.h | 2 +- .../xla/service/gpu/conditional_thunk.cc | 4 ++-- .../xla/service/gpu/conditional_thunk.h | 2 +- .../compiler/xla/service/gpu/for_thunk.cc | 4 ++-- .../compiler/xla/service/gpu/for_thunk.h | 2 +- .../compiler/xla/service/gpu/gpu_compiler.cc | 21 ++++++------------- .../compiler/xla/service/gpu/gpu_compiler.h | 7 ++++--- .../xla/service/gpu/gpu_executable.cc | 10 +++++---- .../compiler/xla/service/gpu/gpu_executable.h | 18 ++++++++++------ .../compiler/xla/service/gpu/gpu_types.h | 17 --------------- .../compiler/xla/service/gpu/kernel_thunk.cc | 9 ++++---- .../compiler/xla/service/gpu/kernel_thunk.h | 4 +++- .../xla/service/gpu/nvptx_compiler.cc | 11 ++++++---- .../compiler/xla/service/gpu/nvptx_compiler.h | 2 +- .../xla/service/gpu/sequential_thunk.cc | 4 ++-- .../xla/service/gpu/sequential_thunk.h | 2 +- tensorflow/compiler/xla/service/gpu/thunk.h | 5 +++-- .../compiler/xla/service/gpu/while_thunk.cc | 6 +++--- .../compiler/xla/service/gpu/while_thunk.h | 2 +- .../service/mlir_gpu/mlir_compiler_impl.cc | 7 +++---- .../compiler/xla/tests/llvm_compiler_test.cc | 7 ++++--- 23 files changed, 78 insertions(+), 90 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index bff8734de5f..0f6b2cb72e6 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -17,15 +17,15 @@ load( "tf_cuda_library", ) load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") -load( - "@local_config_rocm//rocm:build_defs.bzl", - "if_rocm", - "if_rocm_is_configured", -) +load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) +load( + 
"@local_config_rocm//rocm:build_defs.bzl", + "if_rocm_is_configured", +) load("//tensorflow:tensorflow.bzl", "if_nccl") package( @@ -86,7 +86,6 @@ cc_library( name = "gpu_types", hdrs = ["gpu_types.h"], deps = [ - "//tensorflow/compiler/xla:types", "@com_google_absl//absl/types:variant", ], ) @@ -406,7 +405,6 @@ cc_library( deps = [ ":buffer_allocations", ":gpu_executable_run_options", - ":gpu_types", ":hlo_execution_profiler", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/service:hlo", diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc index 485aff0c4d8..974db02b1b3 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc @@ -104,9 +104,11 @@ GpuVersion AMDGPUCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return isa_version; } -StatusOr AMDGPUCompiler::CompileTargetBinary( - const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr>> +AMDGPUCompiler::CompileTargetBinary(const HloModule* module, + llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { if (rocdl_dir_.empty()) { // Compute rocdl_dir_ just once and cache it in this member. rocdl_dir_ = GetROCDLDir(module->config()); @@ -127,7 +129,7 @@ StatusOr AMDGPUCompiler::CompileTargetBinary( user_post_optimization_hook_(*llvm_module); } - return GpuTargetBinary{"", std::move(hsaco)}; + return std::pair>("", std::move(hsaco)); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h index 9033585763b..acc5e021e3d 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.h @@ -39,7 +39,7 @@ class AMDGPUCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr CompileTargetBinary( + StatusOr>> CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index 5e7d89c7aee..e31f45942b1 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -50,7 +50,7 @@ void ConditionalThunk::ComputeAnnotations() { } } -Status ConditionalThunk::Initialize(const GpuTargetBinary& target_binary, +Status ConditionalThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { if (branch_index_is_bool_) { TF_RET_CHECK(branch_thunks_.size() == 2); @@ -58,7 +58,7 @@ Status ConditionalThunk::Initialize(const GpuTargetBinary& target_binary, TF_RET_CHECK(!branch_thunks_.empty()); } for (auto& branch_thunk : branch_thunks_) { - TF_RETURN_IF_ERROR(branch_thunk->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(branch_thunk->Initialize(executable, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index ba69e1a38ec..404e2131eff 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -52,7 +52,7 @@ class ConditionalThunk : public Thunk { ConditionalThunk& operator=(const 
ConditionalThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index aacc9deb739..0a97f668b38 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -39,9 +39,9 @@ void ForThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status ForThunk::Initialize(const GpuTargetBinary& target_binary, +Status ForThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index 57657b6825f..57402f70627 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -38,7 +38,7 @@ class ForThunk : public Thunk { ForThunk& operator=(const ForThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 533ff52a90d..5f6dfd7d3a5 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -565,7 +565,8 @@ StatusOr> GpuCompiler::RunBackend( GpuVersion gpu_version = GetGpuVersion(stream_exec); - TF_ASSIGN_OR_RETURN(GpuTargetBinary backend_result, + using BackendCompileResult = std::pair>; + TF_ASSIGN_OR_RETURN(BackendCompileResult backend_result, CompileTargetBinary(module.get(), &llvm_module, gpu_version, stream_exec)); @@ -577,11 +578,6 @@ StatusOr> GpuCompiler::RunBackend( thunk_schedule->ToString()); } - std::vector thunks; - for (Thunk* thunk : thunk_schedule->TotalOrder()) { - thunks.push_back(thunk); - } - std::unique_ptr profile_index_map; std::unique_ptr profile_printer; @@ -601,19 +597,14 @@ StatusOr> GpuCompiler::RunBackend( } auto* gpu_executable = new GpuExecutable( - std::move(backend_result), gpu_version, std::move(thunk_schedule), - std::move(module), std::move(buffer_assignment), - std::move(profile_printer), std::move(profile_index_map)); + backend_result.first, backend_result.second, gpu_version, + std::move(thunk_schedule), std::move(module), + std::move(buffer_assignment), std::move(profile_printer), + std::move(profile_index_map)); if (embed_ir_in_executable) { DCHECK_NE("", ir_module_string_before_opt); gpu_executable->set_ir_module_string(ir_module_string_before_opt); } - - for (Thunk* thunk : thunks) { - TF_RETURN_IF_ERROR( - thunk->Initialize(gpu_executable->target_binary(), stream_exec)); - } - return std::unique_ptr(gpu_executable); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h index deb5d785777..b52af5392d1 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.h @@ -74,9 +74,10 @@ 
class GpuCompiler : public LLVMCompiler { virtual GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) = 0; - virtual StatusOr CompileTargetBinary( - const HloModule* hlo_module, llvm::Module* llvm_module, - GpuVersion gpu_version, se::StreamExecutor* stream_exec) = 0; + virtual StatusOr>> + CompileTargetBinary(const HloModule* hlo_module, llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) = 0; Status PrepareHloModuleForIrEmitting(HloModule* hlo_module); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index ebd3630635b..2df6b50d361 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -52,15 +52,16 @@ using ::tensorflow::profiler::ScopedAnnotation; // Implementation note: HLO profiling is always enabled for GPU executables, // since we can use timers around thunks. GpuExecutable::GpuExecutable( - GpuTargetBinary target_binary, GpuVersion gpu_version, - std::unique_ptr thunk_schedule, + const string& text, const std::vector& binary, + GpuVersion gpu_version, std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, std::unique_ptr hlo_profile_printer_data, std::unique_ptr hlo_profile_index_map) : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), std::move(hlo_profile_index_map)), - target_binary_(std::move(target_binary)), + text_(text), + binary_(binary), gpu_version_(gpu_version), thunk_schedule_(std::move(thunk_schedule)), assignment_(std::move(assignment)) { @@ -175,6 +176,7 @@ Status GpuExecutable::ExecuteThunks( // module, we won't get any data, but that's probably an OK trade-off. ScopedAnnotation annotation([&] { return thunk->profile_annotation(); }); + TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); se::Stream* stream = @@ -467,7 +469,7 @@ const InstructionValueSet& GpuExecutable::GetRootValueSet() const { int64 GpuExecutable::SizeOfGeneratedCodeInBytes() { // Non-empty PTX but empty cubin: compilation must have failed, return // "unknown". - if (binary().empty() && !text().empty()) { + if (binary().empty() && !text_.empty()) { return -1; } return binary().size(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 29441c60b04..045a36c099b 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -52,7 +52,8 @@ class GpuExecutable : public Executable { // We need to share ownership of hlo_module and assignment with profiler to // safely keep a reference to these objects during tracing period, thus they // are passed as shared pointers. - GpuExecutable(GpuTargetBinary target_binary, GpuVersion gpu_version, + GpuExecutable(const string& text, const std::vector& binary, + GpuVersion gpu_version, std::unique_ptr thunk_schedule, std::shared_ptr hlo_module, std::shared_ptr assignment, @@ -72,14 +73,12 @@ class GpuExecutable : public Executable { // Returns the compiled code for the computation. The compiled code is PTX in // Cuda and unused empty string in ROCm. - const string& text() const { return target_binary_.text; } + const string& text() const { return text_; } // Returns the binary stored in this GpuExecutable. The binary is cubin in // Cuda, and HSA code object in ROCm. 
It may be empty, in which case // compilation is left up to the GPU driver. - const std::vector& binary() const { return target_binary_.binary; } - - const GpuTargetBinary& target_binary() const { return target_binary_; } + const std::vector& binary() const { return binary_; } // ExecuteAsyncOnStream will fail if the compute capability of the stream // doesn't match the compute capability passed to this object's constructor. @@ -132,7 +131,14 @@ class GpuExecutable : public Executable { // This string should be modified only before ExecuteOnStream. string ir_module_string_; - const GpuTargetBinary target_binary_; + // The compiled code for the computation. + const string text_; + + // The GPU machine code for the computation, targeting GPUs at + // compute_capability_. + // + // May be empty, in which case we leave compilation up to the GPU driver. + const std::vector binary_; // The GPU version for compute compatibility check. GpuVersion gpu_version_; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_types.h b/tensorflow/compiler/xla/service/gpu/gpu_types.h index 5c8b8093d65..1c51040fb82 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_types.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_types.h @@ -16,11 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_TYPES_H_ -#include -#include - #include "absl/types/variant.h" -#include "tensorflow/compiler/xla/types.h" namespace xla { namespace gpu { @@ -29,19 +25,6 @@ namespace gpu { // it comprises a pair of integers denoting major and minor version. // On ROCm platform, it comprises one integer for AMD GCN ISA version. using GpuVersion = absl::variant, int>; - -// A struct to carry around compiled results by the GPU assembler. -struct GpuTargetBinary { - GpuTargetBinary(const GpuTargetBinary& other) = delete; - GpuTargetBinary(GpuTargetBinary&& other) = default; - - // The text format of the compiled result, e.g. PTX. - std::string text; - - // The actual compiled binary. - std::vector binary; -}; - } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index 0b5010ea66b..d976b5d8d4d 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -41,7 +42,7 @@ KernelThunk::KernelThunk(absl::Span args, kernel_name_(kernel_name), unroll_factor_(unroll_factor) {} -Status KernelThunk::Initialize(const GpuTargetBinary& target_binary, +Status KernelThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { tensorflow::mutex_lock lock(mutex_); @@ -54,10 +55,8 @@ Status KernelThunk::Initialize(const GpuTargetBinary& target_binary, if (kernel_cache_.end() == it) { TF_ASSIGN_OR_RETURN( std::unique_ptr kernel, - CreateKernel(kernel_name_, args_.size(), target_binary.text, - target_binary.binary, executor)); - CHECK(!target_binary.binary.empty()); - CHECK(kernel); + CreateKernel(kernel_name_, args_.size(), executable.text(), + executable.binary(), executor)); kernel_cache_.emplace(executor, std::move(kernel)); } diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index 97a1d08a57e..88351881f3a 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -35,6 +35,8 @@ limitations under the License. namespace xla { namespace gpu { +class GpuExecutable; + // This class stores everything that StreamExecutor needs for launching a // kernel. It implements the ExecuteOnStream interface for GpuExecutable to // invoke the corresponding kernel. @@ -56,7 +58,7 @@ class KernelThunk : public Thunk { int unroll_factor() const { return unroll_factor_; } void SetLaunchDimensions(const LaunchDimensions& launch_dims); - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index cf6fe9292e5..0196267d904 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -295,9 +295,11 @@ GpuVersion NVPTXCompiler::GetGpuVersion(se::StreamExecutor* stream_exec) { return std::make_pair(cc_major, cc_minor); } -StatusOr NVPTXCompiler::CompileTargetBinary( - const HloModule* module, llvm::Module* llvm_module, GpuVersion gpu_version, - se::StreamExecutor* stream_exec) { +StatusOr>> +NVPTXCompiler::CompileTargetBinary(const HloModule* module, + llvm::Module* llvm_module, + GpuVersion gpu_version, + se::StreamExecutor* stream_exec) { std::pair compute_capability = absl::get>(gpu_version); @@ -338,7 +340,8 @@ StatusOr NVPTXCompiler::CompileTargetBinary( stream_exec, ptx, compute_capability.first, compute_capability.second, module->config()); - return GpuTargetBinary{std::move(ptx), std::move(cubin)}; + return std::pair>(std::move(ptx), + std::move(cubin)); } std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index ec550b5b2ff..e69be947522 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -48,7 +48,7 @@ class NVPTXCompiler : 
public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) override; - StatusOr CompileTargetBinary( + StatusOr>> CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, GpuVersion gpu_version, se::StreamExecutor* stream_exec) override; diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index bd260336c28..025ca60ef0c 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -34,10 +34,10 @@ void SequentialThunk::ComputeAnnotations() { } } -Status SequentialThunk::Initialize(const GpuTargetBinary& target_binary, +Status SequentialThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { for (auto& thunk : thunks_) { - TF_RETURN_IF_ERROR(thunk->Initialize(target_binary, executor)); + TF_RETURN_IF_ERROR(thunk->Initialize(executable, executor)); } return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index b5475664733..3abb82c0b66 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -40,7 +40,7 @@ class SequentialThunk : public Thunk { const std::vector>& thunks() const { return thunks_; } void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index 7aff9ca47b7..e9be41b74de 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" -#include "tensorflow/compiler/xla/service/gpu/gpu_types.h" #include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/core/lib/core/status.h" @@ -31,6 +30,8 @@ limitations under the License. namespace xla { namespace gpu { +class GpuExecutable; + // Thunk acts as the bridge between IrEmitter and GpuExecutable. It stores the // metadata IrEmitter generates for GpuExecutable to invoke an HloInstruction. // @@ -96,7 +97,7 @@ class Thunk { // This may be called multiple times. Its main purpose is to give us a chance // to do initialization outside of ExecuteOnStream() so that the // time spent initializing doesn't count towards our execution profile. 
- virtual Status Initialize(const GpuTargetBinary& /*target_binary*/, + virtual Status Initialize(const GpuExecutable& /*executable*/, se::StreamExecutor* /*executor*/) { return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index 2650508093e..4134cd39832 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -45,11 +45,11 @@ void WhileThunk::ComputeAnnotations() { body_thunk_sequence_->ComputeAnnotations(); } -Status WhileThunk::Initialize(const GpuTargetBinary& target_binary, +Status WhileThunk::Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) { TF_RETURN_IF_ERROR( - condition_thunk_sequence_->Initialize(target_binary, executor)); - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(target_binary, executor)); + condition_thunk_sequence_->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index 77ee0104a1f..31db01b72ba 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -47,7 +47,7 @@ class WhileThunk : public Thunk { WhileThunk& operator=(const WhileThunk&) = delete; void ComputeAnnotations() override; - Status Initialize(const GpuTargetBinary& target_binary, + Status Initialize(const GpuExecutable& executable, se::StreamExecutor* executor) override; Status ExecuteOnStream(const ExecuteParams& params) override; diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc index 667cdef8f6c..35ac3b2bf63 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc @@ -549,11 +549,10 @@ StatusOr> MlirCompilerImpl::RunBackend( } // TODO(b/137624192): Add profiling support. - return {absl::make_unique( - xla::gpu::GpuTargetBinary{ptx, cubin}, GetGpuVersion(stream_exec), - std::move(thunk_schedule), emission_context.releaseHloModule(), - std::move(buffer_assignment), nullptr, nullptr)}; + ptx, cubin, GetGpuVersion(stream_exec), std::move(thunk_schedule), + emission_context.releaseHloModule(), std::move(buffer_assignment), + nullptr, nullptr)}; } StatusOr>> MlirCompilerImpl::Compile( diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc index 16ed02296b7..1947f517bd9 100644 --- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc +++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc @@ -55,15 +55,16 @@ class GpuDummyCompiler : public GpuCompiler { GpuVersion GetGpuVersion(se::StreamExecutor* stream_exec) { return 0; } - StatusOr CompileTargetBinary( + StatusOr>> CompileTargetBinary( const HloModule* hlo_module, llvm::Module* llvm_module, - GpuVersion gpu_version, se::StreamExecutor* stream_exec) override { + GpuVersion gpu_version, se::StreamExecutor* stream_exec) { if (user_post_optimization_hook_) { user_post_optimization_hook_(*llvm_module); } std::vector compiled_results; - return GpuTargetBinary{"", std::move(compiled_results)}; + return std::pair>( + "", std::move(compiled_results)); } }; } // namespace gpu From 8788846283c461a7475af4ce1921d5a2d8b075c3 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Tue, 12 May 2020 15:55:20 -0700 Subject: [PATCH 066/412] Fix `alignment_hint < sizeof(void*)` in BuiltinDataAllocator::Allocate Handle the case when `alignof(T)` return value `< sizeof(void*)` and causes the fail of `aligned_alloc()`. Fix by using `sizeof(void*)` as `alignment_hint` in this case. PiperOrigin-RevId: 311219237 Change-Id: Ib5d9c194ac00f17f4f3a47bf98cba0afbdce5840 --- tensorflow/lite/BUILD | 1 + tensorflow/lite/interpreter_builder.cc | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index a2ab4854165..14babee2da7 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -253,6 +253,7 @@ cc_library( "//tensorflow/lite/core/api", "//tensorflow/lite/delegates/nnapi:nnapi_delegate", "//tensorflow/lite/experimental/resource", + "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/nnapi:nnapi_implementation", "//tensorflow/lite/schema:schema_fbs", ] + select({ diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index e32e0768995..fb87702fd13 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" @@ -209,7 +210,15 @@ class MallocDataAllocator : public BuiltinDataAllocator { public: void* Allocate(size_t size, size_t alignment_hint) override { #ifdef TFLITE_USE_STD_ALIGNED_ALLOC - return aligned_alloc(alignment_hint, size); + // Ensure that alignment is a power of two and a multiple of sizeof(void *) + // and that size is an integral multiple of alignment. + size_t used_alignment = std::max(alignment_hint, sizeof(void*)); + size_t used_size = + ((size + used_alignment - 1) / used_alignment) * used_alignment; + TFLITE_DCHECK( + (used_alignment != 0) && + ((used_alignment & (used_alignment - 1)) == 0)); // is power-of-two + return aligned_alloc(used_alignment, used_size); #else return malloc(size); #endif From 3200e57b9cacf2883cfd28b18a9edf71bafaefca Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 12 May 2020 15:59:52 -0700 Subject: [PATCH 067/412] Update the OSS image to pickup latest tf estimator PIP package. PiperOrigin-RevId: 311220029 Change-Id: I1bbcdf92f410a074d6918cf61cfb380daa74339e --- ....rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 | 7 +++++++ .../Dockerfile.rbe.ubuntu16.04-manylinux2010 | 15 +++++++-------- .../toolchains/preconfig/generate/containers.bzl | 4 ++-- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 index df4b847b6f7..91d501109d0 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010 @@ -75,6 +75,13 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py RUN python3.8 get-pip.py RUN python3.8 -m pip install --upgrade pip setuptools wheel +# Overwrite include paths that are generated for the multipython image. 
+RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" + +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" + # Make apt work with python 3.6. RUN cp /usr/lib/python3/dist-packages/apt_pkg.cpython-35m-x86_64-linux-gnu.so \ /usr/lib/python3/dist-packages/apt_pkg.so diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 b/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 index 516129ccd43..a14b9ac2a3e 100644 --- a/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 +++ b/tensorflow/tools/ci_build/Dockerfile.rbe.ubuntu16.04-manylinux2010 @@ -73,13 +73,12 @@ RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py RUN python3.8 get-pip.py RUN python3.8 -m pip install --upgrade pip setuptools wheel -# TODO(klimek): Figure out a better way to get the right include paths -# forwarded when we install new packages. -RUN ln -s "/usr/include/x86_64-linux-gnu/python2.7" "/dt7/usr/include/x86_64-linux-gnu/python2.7" -RUN ln -s "/usr/include/x86_64-linux-gnu/python2.7" "/dt8/usr/include/x86_64-linux-gnu/python2.7" +# Overwrite include paths that are generated for the multipython image. +RUN ln -sf "/usr/include/x86_64-linux-gnu/python2.7" "/dt7/usr/include/x86_64-linux-gnu/python2.7" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python2.7" "/dt8/usr/include/x86_64-linux-gnu/python2.7" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt7/usr/include/x86_64-linux-gnu/python3.6m" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.6m" "/dt8/usr/include/x86_64-linux-gnu/python3.6m" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" -RUN ln -s "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt7/usr/include/x86_64-linux-gnu/python3.8" +RUN ln -sf "/usr/include/x86_64-linux-gnu/python3.8" "/dt8/usr/include/x86_64-linux-gnu/python3.8" \ No newline at end of file diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl index 9be398f5f2d..8e6f48df99e 100644 --- a/third_party/toolchains/preconfig/generate/containers.bzl +++ b/third_party/toolchains/preconfig/generate/containers.bzl @@ -2,13 +2,13 @@ container_digests = { "ubuntu16.04": "sha256:b90dcf2f35f3354909f4491bdf019c110b4b4d95ef0395ebf178bc5d523a4208", "centos6": "sha256:d09c12fb26fbbe8398b4973260c75172eb67d509dae9d6f4ad54279b7d6b0494", - "ubuntu16.04-manylinux2010": "sha256:b5227c4069980005336dd5cf04e3122974984da3396a514a06d7db3a7ae7b2f9", + "ubuntu16.04-manylinux2010": "sha256:d5b056506e14eb216b6e27988814617a09dea77ec1ab46972072038f9df3e728", "cuda10.0-cudnn7-ubuntu14.04": "sha256:d433e1221f802dac393bc8652fabcc63aa46896cd920bb888ae0e2002fe6b756", "cuda10.0-cudnn7-centos7": "sha256:a453b7147a60928a8345689eae48916a746b3578b5e831bfa151f0529d469c88", "cuda10.0-cudnn7-centos6": "sha256:a1909ba09c703340ee0074ce63dd94fe8fea48035a25264677907a609e2375e0", "cuda10.1-cudnn7-centos6": 
"sha256:454b899657e87893ee5e68dc0f87df59b6a0a7418ae09cafcc3dd65ac71feca9", "cuda10.0-cudnn7-ubuntu16.04-manylinux2010": "sha256:5812d9d0ef0a3276fc5faaf4cd01f3d6e03d635893a6e2d2e04f6f01d626c432", - "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:cc7f760195d7bbe283b45ae740409751d0b74d8ffbdc2f7a3cb62c71a71fbe25", + "cuda10.1-cudnn7-ubuntu16.04-manylinux2010": "sha256:1e4e888f14a3d5b127151f7970487613a46ca957babe0432786627c78c0b1a36", "cuda10.1-cudnn7-ubuntu16.04-manylinux2010-multipython": "sha256:13aa5e700bb609521cd4365d4152d7d8f4118cae7ce174ce7d54cc529e21766a", "rocm-ubuntu16.04": "sha256:e645447dd6127325f3e97b8bf23424f637a8579d963b34fcc6772cf7cfaa0ebe", "windows-1803": "sha256:f109576c7c0c8a1783ff22b666e8923b52dbbe7933f69a1c7a7275202c304a12", From 1afe51a60cbda6fc42b157f6393063052208da70 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 16:02:45 -0700 Subject: [PATCH 068/412] [tf.data] Update the node destructor to solve the stack overflow problem. PiperOrigin-RevId: 311220597 Change-Id: I7efaa889a27e52c0d05bec9778a7f40976a5e90e --- tensorflow/core/framework/model.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index 1c3b64f4a0d..97ac9dd35ae 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -142,7 +142,31 @@ class Node { metrics_(name_), output_(args.output.get()) {} - virtual ~Node() { FlushMetrics(); } + virtual ~Node() { + // Clear the sub-nodes instead of relying on implicit shared pointer + // destructor to avoid potential stack overflow when the tree is deep. + std::deque> queue; + { + mutex_lock l(mu_); + while (inputs_.size() > 0) { + queue.push_back(inputs_.front()); + inputs_.pop_front(); + } + } + while (!queue.empty()) { + auto node = queue.back(); + queue.pop_back(); + { + mutex_lock l(node->mu_); + while (node->inputs_.size() > 0) { + queue.push_back(node->inputs_.front()); + node->inputs_.pop_front(); + } + } + } + + FlushMetrics(); + } // Adds an input. void add_input(std::shared_ptr node) TF_LOCKS_EXCLUDED(mu_) { From 2ca6769ae8b49f4e17cdbe7c9976da84ec3e3ca7 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 12 May 2020 16:09:03 -0700 Subject: [PATCH 069/412] Update docstring to remove confusing term num_split/size_split, based on review Also removes unnecessary `...` Signed-off-by: Yong Tang --- tensorflow/python/ops/array_ops.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 8aa5d66f402..523020df772 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1919,27 +1919,25 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): See also `tf.unstack`. - If `num_or_size_splits` is an integer, we call it num_split and - `value` is split along the dimension `axis` into `num_split` smaller - tensors. This requires that `value.shape[axis]` is divisible by `num_split`. + If `num_or_size_splits` is an integer, then `value` is split along the + dimension `axis` into `num_or_size_splits` smaller tensors. This requires that + `value.shape[axis]` is divisible by `num_or_size_splits`. - If `num_or_size_splits` is a 1-D Tensor (or list), we call it `size_splits` - and `value` is split into `len(size_splits)` elements. 
The shape of the `i`-th + If `num_or_size_splits` is a 1-D Tensor (or list), then `value` is split into + `len(num_or_size_splits)` elements. The shape of the `i`-th element has the same size as the `value` except along dimension `axis` where - the size is `size_splits[i]`. + the size is `num_or_size_splits[i]`. For example: >>> x = tf.Variable(tf.random.uniform([5, 30], -1, 1)) >>> - >>> # Split `x` into 3 tensors along dimension 1: - ... + >>> # Split `x` into 3 tensors along dimension 1 >>> s0, s1, s2 = tf.split(x, num_or_size_splits=3, axis=1) >>> tf.shape(s0).numpy() array([ 5, 10], dtype=int32) >>> >>> # Split `x` into 3 tensors with sizes [4, 15, 11] along dimension 1 - ... >>> split0, split1, split2 = tf.split(x, [4, 15, 11], 1) >>> tf.shape(split0).numpy() array([5, 4], dtype=int32) From 3924ce6cc0afa3c7b46c70b16ed3284082fc0ece Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Tue, 12 May 2020 16:14:32 -0700 Subject: [PATCH 070/412] Enable Reduce Mean op in Hexagon delegate for uint8/int8. Added requantize op after Mean to make sure the output is with correct scale. PiperOrigin-RevId: 311222782 Change-Id: Idf0a627fd1da3bc13d68b276dbcf8cc07011c435 --- .../experimental/delegates/hexagon/README.md | 1 + .../hexagon/builders/reduce_builder.cc | 68 +++++++++++-------- .../hexagon/builders/tests/reduce_test.cc | 52 +++++++++----- .../experimental/delegates/hexagon/utils.cc | 7 +- 4 files changed, 82 insertions(+), 46 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/README.md b/tensorflow/lite/experimental/delegates/hexagon/README.md index 07f1a92bdec..a97342c9fdc 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/README.md +++ b/tensorflow/lite/experimental/delegates/hexagon/README.md @@ -80,6 +80,7 @@ are verified in `IsNodeSupportedByHexagon`: * L2Normalization (without any activation) * Logistic (aka Sigmoid) * MaxPool2D (without any activation) (b/129276536) +* Mean * MirrorPad * Mul (without any activation) (b/129276536) * Neg diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc index 8401f76cf4d..066c82560a8 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/reduce_builder.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/experimental/delegates/hexagon/hexagon_nn/hexagon_nn.h" #include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/util.h" namespace tflite { namespace delegates { @@ -35,9 +36,7 @@ TfLiteStatus ReduceOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, tensor_id = inputs->data[0]; const auto& input_tensor = context->tensors[tensor_id]; AddInput(graph_builder_->GetHexagonTensorId(tensor_id)); - ComputeMinAndMaxQuantValues(input_tensor, &input_min_, &input_max_, - std::numeric_limits::min(), - std::numeric_limits::max()); + ComputeMinAndMaxQuantValues(input_tensor, &input_min_, &input_max_); auto* input_min_const = graph_builder_->AddConstNodeWithData( quant_bound_shape, reinterpret_cast(&input_min_), sizeof(input_min_)); @@ -63,37 +62,48 @@ TfLiteStatus ReduceOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, return kTfLiteError; } + auto& output_tensor = context->tensors[outputs->data[0]]; int output_batch_size, output_height_size, output_width_size, output_depth_size; GetDims(&output_batch_size, &output_height_size, &output_width_size, - &output_depth_size, context->tensors[outputs->data[0]].dims); + &output_depth_size, output_tensor.dims); - // Hexagon's sum-reduction outputs int32, so we shrink it down to UInt8. - if (op_node_.op_type == OP_QuantizedSum_8to32) { - const auto& reduce_out = AddOutput(sizeof(int32_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - const auto& reduce_out_min = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - const auto& reduce_out_max = AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + float output_min = -1, output_max = -1; + ComputeMinAndMaxQuantValues(output_tensor, &output_min, &output_max); + auto* output_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&output_min), + sizeof(output_min)); + auto* output_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&output_max), + sizeof(output_max)); + // Min/max values for output tensor. + AddInput(TensorID(output_min_const->GetID(), 0)); + AddInput(TensorID(output_max_const->GetID(), 0)); - auto* quantize_output_op = graph_builder_->AddNode(GetTFLiteNodeID()); - quantize_output_op->SetOpType(OP_QuantizeDownAndShrinkRange_32to8); - quantize_output_op->AddInput(reduce_out); - quantize_output_op->AddInput(reduce_out_min); - quantize_output_op->AddInput(reduce_out_max); - node_output_ = - quantize_output_op->AddOutput(sizeof(uint8_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - quantize_output_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - quantize_output_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - } else { - node_output_ = AddOutput(sizeof(uint8_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - } + // Add outputs + size_t output_element_size = 0; + TF_LITE_ENSURE_STATUS( + GetSizeOfType(context, output_tensor.type, &output_element_size)); + auto mean_output = AddOutput(output_element_size, 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + auto mean_out_min = AddOutput(output_element_size, 4, {1, 1, 1, 1}); + auto mean_out_max = AddOutput(output_element_size, 4, {1, 1, 1, 1}); + // Mean op doesn't honor the passed min/max for output, so we need + // to add requantize. 
+ auto* requantize_op = graph_builder_->AddNode(GetTFLiteNodeID()); + requantize_op->SetOpType(OP_Requantize_8to8); + requantize_op->AddInput(mean_output); + requantize_op->AddInput(mean_out_min); + requantize_op->AddInput(mean_out_max); + requantize_op->AddInput(TensorID(output_min_const->GetID(), 0)); + requantize_op->AddInput(TensorID(output_max_const->GetID(), 0)); + node_output_ = + requantize_op->AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + requantize_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + requantize_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); return kTfLiteOk; } diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc index 7e4f95ffa96..a3cd8c8255b 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/reduce_test.cc @@ -18,8 +18,8 @@ limitations under the License. namespace tflite { using testing::ElementsAreArray; -// TODO(b/148390890): All tests are disabled, enable after fix is availabel -// and op is enabled. +// TODO(b/148390890): Reduce Sum tests are disabled, enable after fix is +// available and op is enabled. class ReduceOpModel : public SingleOpModelWithHexagon { public: ReduceOpModel(BuiltinOperator type, const TensorData& input, @@ -49,32 +49,52 @@ class ReduceOpModel : public SingleOpModelWithHexagon { int output_; }; -TEST(ReduceOpModel, DISABLED_MeanNotKeepDims) { +template +void TestMeanImpl() { float kQuantizedTolerance = 2.0 / 255; std::vector data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - ReduceOpModel m(BuiltinOperator_MEAN, - {TensorType_UINT8, {1, 1, 3, 2}, -1.0, 1.0}, - {TensorType_UINT8, {2}, -1.0, 1.0}, {1}, {2}, false); - m.QuantizeAndPopulate(m.Input(), data); + ReduceOpModel m(BuiltinOperator_MEAN, {Tensor_Type, {1, 1, 3, 2}, -1.0, 1.0}, + {Tensor_Type, {2}, -1.0, 1.0}, {1}, {2}, false); + m.QuantizeAndPopulate(m.Input(), data); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); m.ApplyDelegateAndInvoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 2})); EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({0.4, 0.4}, kQuantizedTolerance))); + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))); } -TEST(ReduceOpModel, DISABLED_MeanKeepDims) { +TEST(ReduceOpModel, MeanNotKeepDims_Uint8) { + TestMeanImpl(); +} + +TEST(ReduceOpModel, MeanNotKeepDims_Int8) { + TestMeanImpl(); +} + +template +void TestMeanKeppDimsImpl() { float kQuantizedTolerance = 2.0 / 255; std::vector data = {0.4, 0.2, 0.3, 0.4, 0.5, 0.6}; - ReduceOpModel m(BuiltinOperator_MEAN, - {TensorType_UINT8, {1, 1, 3, 2}, -1.0, 1.0}, - {TensorType_UINT8, {3}, -1.0, 1.0}, {1}, {3}, true); - m.QuantizeAndPopulate(m.Input(), data); + ReduceOpModel m(BuiltinOperator_MEAN, {Tensor_Type, {1, 1, 3, 2}, -1.0, 1.0}, + {Tensor_Type, {3}, -1.0, 1.0}, {1}, {3}, true); + m.QuantizeAndPopulate(m.Input(), data); + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); m.ApplyDelegateAndInvoke(); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 1, 3, 1})); EXPECT_THAT( - m.GetDequantizedOutput(), - ElementsAreArray(ArrayFloatNear({0.3, 0.35, 0.55}, kQuantizedTolerance))); + m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output, kQuantizedTolerance))); +} + +TEST(ReduceOpModel, 
MeanKeepDims_Int8) { + TestMeanKeppDimsImpl(); +} + +TEST(ReduceOpModel, MeanKeepDims_Uint8) { + TestMeanKeppDimsImpl(); } TEST(ReduceOpModel, DISABLED_SumNotKeepDims) { diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index d9d14804b49..1df0a6df66c 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -80,6 +80,7 @@ bool CheckOpVersion(const TfLiteRegistration* registration) { case kTfLiteBuiltinL2Normalization: case kTfLiteBuiltinLogistic: case kTfLiteBuiltinMaxPool2d: + case kTfLiteBuiltinMean: case kTfLiteBuiltinMirrorPad: case kTfLiteBuiltinMul: case kTfLiteBuiltinPad: @@ -154,11 +155,15 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return IsActivationReluOrNone(sub_params->activation); } case kTfLiteBuiltinSum: - case kTfLiteBuiltinMean: { // TODO(b/139277813): Enable these when they pass unit tests. These seem // to recompute the output min/max instead of taking them as inputs, which // causes an unexpected shift in dequantized values. return false; + case kTfLiteBuiltinMean: { + return InputsWithCorrectTypes( + node, context, + {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteInt32}}) && + IsConstantTensor(GetInput(context, node, 1)); } case kTfLiteBuiltinMirrorPad: { if (!InputsWithCorrectTypes( From bf0c10e3c4a23b49108c08e8fe32bbb71070e69f Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Tue, 12 May 2020 16:36:53 -0700 Subject: [PATCH 071/412] Refactor Softmax and use new memory API. PiperOrigin-RevId: 311226484 Change-Id: Id044b77f385d6606d272c263d46aae76466e9987 --- .../micro/kernels/xtensa_hifimini/softmax.cc | 65 ++++++++----------- 1 file changed, 27 insertions(+), 38 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc index c77e9d1173c..c95fd0e40a4 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc @@ -34,9 +34,6 @@ namespace { // registrations for selective types (e.g. compile without float support), this // can be removed. Otherwise, any HiFi specific optimizations should land here. -// This size will work for both the hotword (1) and ambient music (0): -static SoftmaxParams kStaticOpData; - TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, const TfLiteTensor* input, TfLiteTensor* output, @@ -47,11 +44,13 @@ TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { if (output->type == kTfLiteInt16) { - TF_LITE_ENSURE_EQ(context, output->params.zero_point, -32768); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, + std::numeric_limits::min()); // NOTE: Current int16 softmax output does not require symmetric scaling // - so no need to verify scale here. 
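      // For reference, the numeric_limits calls in this hunk evaluate to the
      // literals they replace: std::numeric_limits<int16_t>::min() is -32768
      // for the int16 branch above, and std::numeric_limits<int8_t>::min() is
      // -128 for the int8 branch below, so the checks are unchanged in effect.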
} else { - TF_LITE_ENSURE_EQ(context, output->params.zero_point, -128); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, + std::numeric_limits::min()); TF_LITE_ENSURE(context, output->params.scale == 1.f / 256); } } @@ -71,29 +70,18 @@ TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, return kTfLiteOk; } -TfLiteStatus SoftmaxQuantized(TfLiteContext* context, const TfLiteTensor* input, - TfLiteTensor* output, - const SoftmaxParams& op_params) { - switch (output->type) { - case kTfLiteInt16: - tflite::reference_ops::Softmax( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; - case kTfLiteInt8: - tflite::reference_ops::Softmax( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(output), GetTensorData(output)); - return kTfLiteOk; - default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(output->type), output->type); - return kTfLiteError; - } -} - } // namespace +void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) { + TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); + void* data = nullptr; + if (context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams), + &data) == kTfLiteError) { + return nullptr; + } + return data; +} + TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { auto* params = static_cast(node->builtin_data); @@ -103,10 +91,8 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* output = GetOutput(context, node, 0); TF_LITE_ENSURE(context, NumDimensions(input) >= 1); - // TODO(b/132070898): Use statically slotted SoftmaxParams structures until a - // scratch memory API is ready. - SoftmaxParams* op_params = &kStaticOpData; - node->user_data = op_params; + TFLITE_DCHECK(node->user_data != nullptr); + SoftmaxParams* op_params = static_cast(node->user_data); TF_LITE_ENSURE_STATUS( CalculateSoftmaxOpData(context, input, output, params, op_params)); @@ -120,19 +106,22 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); - switch (input->type) { - case kTfLiteInt8: - return SoftmaxQuantized(context, input, output, *op_params); - default: - TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", - TfLiteTypeGetName(input->type), input->type); - return kTfLiteError; + if (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) { + // TODO(b/155656675): Const ref params can be slow on xtensa. 
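    // Sketch of the lifecycle introduced by this refactor, using only names
    // visible in this patch: SoftmaxInit requests a sizeof(SoftmaxParams)
    // persistent buffer via context->AllocatePersistentBuffer and returns it
    // as node->user_data, SoftmaxPrepare fills it through
    // CalculateSoftmaxOpData, and this Eval only reads it back, so each
    // softmax node owns its own params instead of every node sharing the
    // removed file-scope kStaticOpData struct.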
+ tflite::reference_ops::Softmax( + *op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(output), GetTensorData(output)); + return kTfLiteOk; + } else { + TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", + TfLiteTypeGetName(input->type), input->type); + return kTfLiteError; } } } // namespace activations TfLiteRegistration* Register_SOFTMAX() { - static TfLiteRegistration r = {/*init=*/nullptr, + static TfLiteRegistration r = {/*init=*/activations::SoftmaxInit, /*free=*/nullptr, /*prepare=*/activations::SoftmaxPrepare, /*invoke=*/activations::SoftmaxEval, From 194efd1d28235fa15d26ce395f3ef72919183f0b Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 12 May 2020 16:40:59 -0700 Subject: [PATCH 072/412] Disable collective ops xla test on gpu PiperOrigin-RevId: 311227213 Change-Id: Ib9b84515e22e86561ae63c4d94ed49d3e4573c7a --- tensorflow/compiler/xla/tests/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index 1ad1f8363cf..c8a242c156a 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -1909,7 +1909,7 @@ xla_test( # This test is tagged "manual" because it requires multiple GPUs, and # Forge only supports single-GPU tests. Guitar skips "manual" tests # unless they're also tagged "guitar". - # "guitar", # Re-enable after b/156405690 is fixed. + "guitar", "manual", "multi_gpu", "no_oss", From 2d452266176737db8c4dedb7a9e6521c2beb1d49 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 12 May 2020 16:52:16 -0700 Subject: [PATCH 073/412] Move tf.keras.layers.featureDenseFeature back to Keras package. PiperOrigin-RevId: 311229082 Change-Id: I3317086f3b6c53da0f6d0cc4f5558afcd74b264b --- tensorflow/python/feature_column/BUILD | 20 - .../feature_column/feature_column_lib.py | 8 +- .../feature_column/feature_column_v2_test.py | 291 ------------ .../feature_column/keras_integration_test.py | 2 +- .../sequence_feature_column_test.py | 49 --- .../feature_column/serialization_test.py | 66 --- tensorflow/python/keras/feature_column/BUILD | 78 ++++ .../python/keras/feature_column/__init__.py | 0 .../feature_column/dense_features.py | 5 - .../feature_column/dense_features_test.py | 416 +++++++++++++++++- .../feature_column/dense_features_v2.py | 7 +- .../feature_column/dense_features_v2_test.py | 2 +- ...equence_feature_column_integration_test.py | 2 +- .../python/keras/layers/serialization.py | 18 +- .../saving/saved_model/saved_model_test.py | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 4 +- 17 files changed, 509 insertions(+), 463 deletions(-) create mode 100644 tensorflow/python/keras/feature_column/__init__.py rename tensorflow/python/{ => keras}/feature_column/dense_features.py (97%) rename tensorflow/python/{ => keras}/feature_column/dense_features_test.py (62%) rename tensorflow/python/{ => keras}/feature_column/dense_features_v2.py (94%) rename tensorflow/python/{ => keras}/feature_column/dense_features_v2_test.py (99%) diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD index d67cdf9cc06..786c26c009a 100644 --- a/tensorflow/python/feature_column/BUILD +++ b/tensorflow/python/feature_column/BUILD @@ -55,8 +55,6 @@ py_library( py_library( name = "feature_column_v2", srcs = [ - "dense_features.py", - "dense_features_v2.py", "feature_column_v2.py", "sequence_feature_column.py", "serialization.py", @@ -126,15 +124,6 @@ tf_py_test( ], 
) -tf_py_test( - name = "dense_features_test", - srcs = ["dense_features_test.py"], - tags = ["no_pip"], - deps = [ - ":feature_column_test_main_lib", - ], -) - py_library( name = "feature_column_test_main_lib", srcs = ["feature_column_test.py"], @@ -177,15 +166,6 @@ tf_py_test( deps = [":feature_column_v2_test_main_lib"], ) -tf_py_test( - name = "dense_features_v2_test", - srcs = ["dense_features_v2_test.py"], - tags = ["no_pip"], - deps = [ - ":feature_column_v2_test_main_lib", - ], -) - py_library( name = "feature_column_v2_test_main_lib", srcs = ["feature_column_v2_test.py"], diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py index afe14f55bfc..bda20ff3f2c 100644 --- a/tensorflow/python/feature_column/feature_column_lib.py +++ b/tensorflow/python/feature_column/feature_column_lib.py @@ -19,13 +19,13 @@ from __future__ import division from __future__ import print_function # pylint: disable=unused-import,line-too-long,wildcard-import,g-bad-import-order -# We import dense_features_v2 first so that the V1 DenseFeatures is the default -# if users directly import feature_column_lib. -from tensorflow.python.feature_column.dense_features_v2 import * -from tensorflow.python.feature_column.dense_features import * from tensorflow.python.feature_column.feature_column import * from tensorflow.python.feature_column.feature_column_v2 import * from tensorflow.python.feature_column.sequence_feature_column import * from tensorflow.python.feature_column.serialization import * +# We import dense_features_v2 first so that the V1 DenseFeatures is the default +# if users directly import feature_column_lib. +from tensorflow.python.keras.feature_column.dense_features_v2 import * +from tensorflow.python.keras.feature_column.dense_features import * from tensorflow.python.keras.feature_column.sequence_feature_column import * # pylint: enable=unused-import,line-too-long diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index fe769850fb0..a13f38a5203 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -31,7 +31,6 @@ from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column as fc_old from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization @@ -5582,23 +5581,6 @@ class IndicatorColumnTest(test.TestCase): self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]])) self.assertAllClose([[2. 
+ 3.]], self.evaluate(predictions)) - @test_util.run_deprecated_v1 - def test_dense_features(self): - animal = fc.indicator_column( - fc.categorical_column_with_identity('animal', num_buckets=4)) - with ops.Graph().as_default(): - features = { - 'animal': - sparse_tensor.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } - net = df.DenseFeatures([animal])(features) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) - @test_util.run_deprecated_v1 def test_input_layer(self): animal = fc.indicator_column( @@ -6271,156 +6253,6 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), self.evaluate(predictions)) - @parameterized.named_parameters( - { - 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True - }, { - 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False - }) - @test_util.run_deprecated_v1 - def test_dense_features(self, use_safe_embedding_lookup): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) - - # Provide sparse input and get dense result. - l = df.DenseFeatures((embedding_column,)) - dense_features = l({'aaa': sparse_input}) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - if use_safe_embedding_lookup: - self.assertIn('SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - else: - self.assertNotIn( - 'SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - - @test_util.run_deprecated_v1 - def test_dense_features_not_trainable(self): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - trainable=False) - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures((embedding_column,))({ - 'aaa': sparse_input - }) - - # Assert expected embedding variable and lookups. - global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - self.assertItemsEqual([], - ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - @test_util.run_deprecated_v1 def test_input_layer(self): # Inputs. @@ -7326,129 +7158,6 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): # = [3*1 + 5*2, 3*0 +5*0] = [13, 0] self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions)) - def _test_dense_features(self, trainable=True): - # Inputs. 
- vocabulary_size = 3 - sparse_input_a = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 4)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_b = sparse_tensor.SparseTensorValue( - # example 0, ids [0] - # example 1, ids [] - indices=((0, 0),), - values=(0,), - dense_shape=(2, 5)) - sparse_input_c = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 1), (1, 1), (1, 3)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_d = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [] - indices=((0, 1),), - values=(2,), - dense_shape=(2, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0: - # A ids [2], embedding = [7, 11] - # B ids [0], embedding = [1, 2] - # C ids [2], embedding = [7, 11] - # D ids [2], embedding = [7, 11] - (7., 11., 1., 2., 7., 11., 7., 11.), - # example 1: - # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # B ids [], embedding = [0, 0] - # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # D ids [], embedding = [0, 0] - (2., 3.5, 0., 0., 2., 3.5, 0., 0.), - ) - - # Build columns. - categorical_column_a = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc.categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - categorical_column_c = fc.categorical_column_with_identity( - key='ccc', num_buckets=vocabulary_size) - categorical_column_d = fc.categorical_column_with_identity( - key='ddd', num_buckets=vocabulary_size) - - embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( - [categorical_column_c, categorical_column_d], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - - features = { - 'aaa': sparse_input_a, - 'bbb': sparse_input_b, - 'ccc': sparse_input_c, - 'ddd': sparse_input_d - } - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures( - feature_columns=(embedding_column_b, embedding_column_a, - embedding_column_c, embedding_column_d))( - features) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - if trainable: - self.assertItemsEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in trainable_vars])) - else: - self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) - shared_embedding_vars = global_vars - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, - self.evaluate(shared_embedding_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - @test_util.run_deprecated_v1 - def test_dense_features(self): - self._test_dense_features() - - @test_util.run_deprecated_v1 - def test_dense_features_no_trainable(self): - self._test_dense_features(trainable=False) - @test_util.run_deprecated_v1 def test_serialization(self): diff --git a/tensorflow/python/feature_column/keras_integration_test.py b/tensorflow/python/feature_column/keras_integration_test.py index e0677e84e50..456c0204350 100644 --- a/tensorflow/python/feature_column/keras_integration_test.py +++ b/tensorflow/python/feature_column/keras_integration_test.py @@ -23,12 +23,12 @@ import numpy as np from tensorflow.python import keras from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.feature_column import dense_features_v2 from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.feature_column import feature_column_v2 from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.feature_column import dense_features_v2 from tensorflow.python.keras.optimizer_v2 import gradient_descent from tensorflow.python.keras.premade import linear from tensorflow.python.keras.premade import wide_deep diff --git a/tensorflow/python/feature_column/sequence_feature_column_test.py b/tensorflow/python/feature_column/sequence_feature_column_test.py index 3d5d24ec03a..d0cf5ee7670 100644 --- a/tensorflow/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/python/feature_column/sequence_feature_column_test.py @@ -24,7 +24,6 @@ from absl.testing import parameterized import numpy as np from tensorflow.python.client import session -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.feature_column import serialization @@ -111,54 +110,6 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase): sfc.concatenate_context_input(context_input, seq_input) -@test_util.run_all_in_graph_and_eager_modes -class DenseFeaturesTest(test.TestCase): - """Tests DenseFeatures with sequence feature columns.""" - - def test_embedding_column(self): - """Tests that error is raised for sequence embedding column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - 
categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) - - input_layer = dense_features.DenseFeatures([embedding_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In embedding_column: aaa_embedding\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - def test_indicator_column(self): - """Tests that error is raised for sequence indicator column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) - - input_layer = dense_features.DenseFeatures([indicator_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In indicator_column: aaa_indicator\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - def _assert_sparse_tensor_value(test_case, expected, actual): _assert_sparse_tensor_indices_shape(test_case, expected, actual) diff --git a/tensorflow/python/feature_column/serialization_test.py b/tensorflow/python/feature_column/serialization_test.py index 78b72746ac9..881ca0cca5e 100644 --- a/tensorflow/python/feature_column/serialization_test.py +++ b/tensorflow/python/feature_column/serialization_test.py @@ -20,7 +20,6 @@ from __future__ import print_function from absl.testing import parameterized -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization from tensorflow.python.framework import test_util @@ -114,71 +113,6 @@ class FeatureColumnSerializationTest(test.TestCase): self.assertIs(new_price.normalizer_fn, _custom_fn) -@test_util.run_all_in_graph_and_eager_modes -class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_get_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_identity( - key='b', num_buckets=3), dimension=2)] - orig_layer = dense_features.DenseFeatures( - cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - self.assertEqual(config['name'], orig_layer.name) - self.assertEqual(config['trainable'], trainable) - self.assertLen(config['feature_columns'], 2) - self.assertEqual( - config['feature_columns'][0]['class_name'], 'NumericColumn') - self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) - self.assertEqual( - config['feature_columns'][1]['class_name'], 'EmbeddingColumn') - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_from_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']), dimension=2), - fc.indicator_column(fc.categorical_column_with_hash_bucket( - key='c', hash_bucket_size=3))] - orig_layer = dense_features.DenseFeatures( - cols, 
trainable=trainable, name=name) - config = orig_layer.get_config() - - new_layer = dense_features.DenseFeatures.from_config(config) - - self.assertEqual(new_layer.name, orig_layer.name) - self.assertEqual(new_layer.trainable, trainable) - self.assertLen(new_layer._feature_columns, 3) - self.assertEqual(new_layer._feature_columns[0].name, 'a') - self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) - self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') - self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) - - def test_crossed_column(self): - a = fc.categorical_column_with_vocabulary_list( - 'a', vocabulary_list=['1', '2', '3']) - b = fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']) - ab = fc.crossed_column([a, b], hash_bucket_size=2) - cols = [fc.indicator_column(ab)] - - orig_layer = dense_features.DenseFeatures(cols) - config = orig_layer.get_config() - - new_layer = dense_features.DenseFeatures.from_config(config) - - self.assertLen(new_layer._feature_columns, 1) - self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') - - @test_util.run_all_in_graph_and_eager_modes class LinearModelLayerSerializationTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/keras/feature_column/BUILD b/tensorflow/python/keras/feature_column/BUILD index 650efcceb52..94097c28d73 100644 --- a/tensorflow/python/keras/feature_column/BUILD +++ b/tensorflow/python/keras/feature_column/BUILD @@ -12,11 +12,88 @@ exports_files(["LICENSE"]) py_library( name = "feature_column", + srcs = ["__init__.py"], deps = [ + ":dense_features", + ":dense_features_v2", ":sequence_feature_column", ], ) +py_library( + name = "dense_features", + srcs = [ + "dense_features.py", + ], + deps = [ + "//tensorflow/python:framework_ops", + "//tensorflow/python:tf_export", + "//tensorflow/python:util", + "//tensorflow/python/feature_column:feature_column_v2", + "//tensorflow/python/keras:backend", + ], +) + +py_library( + name = "dense_features_v2", + srcs = [ + "dense_features_v2.py", + ], + deps = [ + ":dense_features", + "//tensorflow/python:framework_ops", + "//tensorflow/python:tf_export", + "//tensorflow/python/feature_column:feature_column_v2", + ], +) + +tf_py_test( + name = "dense_features_test", + srcs = ["dense_features_test.py"], + tags = ["no_pip"], + deps = [ + ":dense_features", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:lookup_ops", + "//tensorflow/python:partitioned_variables", + "//tensorflow/python:session", + "//tensorflow/python:sparse_tensor", + "//tensorflow/python:variables", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/feature_column:feature_column_v2", + ], +) + +tf_py_test( + name = "dense_features_v2_test", + srcs = ["dense_features_v2_test.py"], + tags = ["no_pip"], + deps = [ + ":dense_features_v2", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:errors", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:lookup_ops", + "//tensorflow/python:session", + "//tensorflow/python:sparse_tensor", + 
"//tensorflow/python:variables", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/feature_column:feature_column_v2", + ], +) + py_library( name = "sequence_feature_column", srcs = ["sequence_feature_column.py"], @@ -59,6 +136,7 @@ py_test( srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ + ":dense_features", ":sequence_feature_column", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_test_lib", diff --git a/tensorflow/python/keras/feature_column/__init__.py b/tensorflow/python/keras/feature_column/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tensorflow/python/feature_column/dense_features.py b/tensorflow/python/keras/feature_column/dense_features.py similarity index 97% rename from tensorflow/python/feature_column/dense_features.py rename to tensorflow/python/keras/feature_column/dense_features.py index 6feef185815..820f1a6b1b7 100644 --- a/tensorflow/python/feature_column/dense_features.py +++ b/tensorflow/python/keras/feature_column/dense_features.py @@ -23,7 +23,6 @@ import json from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops from tensorflow.python.keras import backend -from tensorflow.python.keras.layers import serialization as layer_serialization from tensorflow.python.util import serialization from tensorflow.python.util.tf_export import keras_export @@ -173,7 +172,3 @@ class DenseFeatures(fc._BaseFeaturesLayer): # pylint: disable=protected-access cols_to_output_tensors[column] = processed_tensors output_tensors.append(processed_tensors) return self._verify_and_concat_tensors(output_tensors) - - -layer_serialization.inject_feature_column_v1_objects( - 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/feature_column/dense_features_test.py b/tensorflow/python/keras/feature_column/dense_features_test.py similarity index 62% rename from tensorflow/python/feature_column/dense_features_test.py rename to tensorflow/python/keras/feature_column/dense_features_test.py index 7cd523dcc14..ec07964bcbe 100644 --- a/tensorflow/python/feature_column/dense_features_test.py +++ b/tensorflow/python/keras/feature_column/dense_features_test.py @@ -18,19 +18,21 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.keras.feature_column import dense_features as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import partitioned_variables @@ -676,5 +678,417 @@ class DenseFeaturesTest(test.TestCase): sess.run(net, feed_dict={features['price']: np.array(1)}) +class IndicatorColumnTest(test.TestCase): + + @test_util.run_deprecated_v1 + def test_dense_features(self): + animal = 
fc.indicator_column( + fc.categorical_column_with_identity('animal', num_buckets=4)) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + net = df.DenseFeatures([animal])(features) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) + + +class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False + }) + @test_util.run_deprecated_v1 + def test_dense_features(self, use_safe_embedding_lookup): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + + # Provide sparse input and get dense result. + l = df.DenseFeatures((embedding_column,)) + dense_features = l({'aaa': sparse_input}) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + + @test_util.run_deprecated_v1 + def test_dense_features_not_trainable(self): + # Inputs. 
+ vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + trainable=False) + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures((embedding_column,))({ + 'aaa': sparse_input + }) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + +class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): + + def _test_dense_features(self, trainable=True): + # Inputs. + vocabulary_size = 3 + sparse_input_a = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 4)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_b = sparse_tensor.SparseTensorValue( + # example 0, ids [0] + # example 1, ids [] + indices=((0, 0),), + values=(0,), + dense_shape=(2, 5)) + sparse_input_c = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 1), (1, 1), (1, 3)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_d = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [] + indices=((0, 1),), + values=(2,), + dense_shape=(2, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. 
+ expected_lookups = ( + # example 0: + # A ids [2], embedding = [7, 11] + # B ids [0], embedding = [1, 2] + # C ids [2], embedding = [7, 11] + # D ids [2], embedding = [7, 11] + (7., 11., 1., 2., 7., 11., 7., 11.), + # example 1: + # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # B ids [], embedding = [0, 0] + # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # D ids [], embedding = [0, 0] + (2., 3.5, 0., 0., 2., 3.5, 0., 0.), + ) + + # Build columns. + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=vocabulary_size) + categorical_column_c = fc.categorical_column_with_identity( + key='ccc', num_buckets=vocabulary_size) + categorical_column_d = fc.categorical_column_with_identity( + key='ddd', num_buckets=vocabulary_size) + + embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( + [categorical_column_c, categorical_column_d], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + + features = { + 'aaa': sparse_input_a, + 'bbb': sparse_input_b, + 'ccc': sparse_input_c, + 'ddd': sparse_input_d + } + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures( + feature_columns=(embedding_column_b, embedding_column_a, + embedding_column_c, embedding_column_d))( + features) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertCountEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + if trainable: + self.assertCountEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in trainable_vars])) + else: + self.assertCountEqual([], tuple([v.name for v in trainable_vars])) + shared_embedding_vars = global_vars + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, + self.evaluate(shared_embedding_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + @test_util.run_deprecated_v1 + def test_dense_features(self): + self._test_dense_features() + + @test_util.run_deprecated_v1 + def test_dense_features_no_trainable(self): + self._test_dense_features(trainable=False) + + +@test_util.run_all_in_graph_and_eager_modes +class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_get_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_identity( + key='b', num_buckets=3), dimension=2)] + orig_layer = df.DenseFeatures( + cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + self.assertEqual(config['name'], orig_layer.name) + self.assertEqual(config['trainable'], trainable) + self.assertLen(config['feature_columns'], 2) + self.assertEqual( + 
config['feature_columns'][0]['class_name'], 'NumericColumn') + self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) + self.assertEqual( + config['feature_columns'][1]['class_name'], 'EmbeddingColumn') + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_from_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']), dimension=2), + fc.indicator_column(fc.categorical_column_with_hash_bucket( + key='c', hash_bucket_size=3))] + orig_layer = df.DenseFeatures( + cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + new_layer = df.DenseFeatures.from_config(config) + + self.assertEqual(new_layer.name, orig_layer.name) + self.assertEqual(new_layer.trainable, trainable) + self.assertLen(new_layer._feature_columns, 3) + self.assertEqual(new_layer._feature_columns[0].name, 'a') + self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) + self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') + self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) + + def test_crossed_column(self): + a = fc.categorical_column_with_vocabulary_list( + 'a', vocabulary_list=['1', '2', '3']) + b = fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']) + ab = fc.crossed_column([a, b], hash_bucket_size=2) + cols = [fc.indicator_column(ab)] + + orig_layer = df.DenseFeatures(cols) + config = orig_layer.get_config() + + new_layer = df.DenseFeatures.from_config(config) + + self.assertLen(new_layer._feature_columns, 1) + self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') + + +@test_util.run_all_in_graph_and_eager_modes +class SequenceFeatureColumnsTest(test.TestCase): + """Tests DenseFeatures with sequence feature columns.""" + + def test_embedding_column(self): + """Tests that error is raised for sequence embedding column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column_a = fc.embedding_column( + categorical_column_a, dimension=2) + + input_layer = df.DenseFeatures([embedding_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In embedding_column: aaa_embedding\. categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + def test_indicator_column(self): + """Tests that error is raised for sequence indicator column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + indicator_column_a = fc.indicator_column(categorical_column_a) + + input_layer = df.DenseFeatures([indicator_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In indicator_column: aaa_indicator\. 
categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/feature_column/dense_features_v2.py b/tensorflow/python/keras/feature_column/dense_features_v2.py similarity index 94% rename from tensorflow/python/feature_column/dense_features_v2.py rename to tensorflow/python/keras/feature_column/dense_features_v2.py index 405c5d63249..e4dc22f1bbe 100644 --- a/tensorflow/python/feature_column/dense_features_v2.py +++ b/tensorflow/python/keras/feature_column/dense_features_v2.py @@ -18,10 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops -from tensorflow.python.keras.layers import serialization as layer_serialization +from tensorflow.python.keras.feature_column import dense_features from tensorflow.python.util.tf_export import keras_export @@ -94,7 +93,3 @@ class DenseFeatures(dense_features.DenseFeatures): # We would like to call Layer.build and not _DenseFeaturesHelper.build. # pylint: disable=protected-access super(fc._BaseFeaturesLayer, self).build(None) # pylint: disable=bad-super-call - - -layer_serialization.inject_feature_column_v2_objects( - 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/feature_column/dense_features_v2_test.py b/tensorflow/python/keras/feature_column/dense_features_v2_test.py similarity index 99% rename from tensorflow/python/feature_column/dense_features_v2_test.py rename to tensorflow/python/keras/feature_column/dense_features_v2_test.py index 71cb163a7d9..95fc8b7ac1e 100644 --- a/tensorflow/python/feature_column/dense_features_v2_test.py +++ b/tensorflow/python/keras/feature_column/dense_features_v2_test.py @@ -23,7 +23,6 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context -from tensorflow.python.feature_column import dense_features_v2 as df from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -31,6 +30,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.keras.feature_column import dense_features_v2 as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables as variables_lib diff --git a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py index 8784182e23b..b1100bf7b07 100644 --- a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py +++ b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py @@ -24,11 +24,11 @@ from google.protobuf import text_format from tensorflow.core.example import example_pb2 from tensorflow.core.example import feature_pb2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from 
tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util +from tensorflow.python.keras.feature_column import dense_features from tensorflow.python.keras.feature_column import sequence_feature_column as ksfc from tensorflow.python.keras.layers import recurrent from tensorflow.python.ops import init_ops_v2 diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 67aaf1d6eb8..fc7feda07a5 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -64,23 +64,11 @@ ALL_V2_MODULES = ( recurrent_v2, preprocessing_normalization ) -FEATURE_COLUMN_V1_OBJECTS = {} -FEATURE_COLUMN_V2_OBJECTS = {} # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() -def inject_feature_column_v1_objects(name, cls): - global FEATURE_COLUMN_V1_OBJECTS - FEATURE_COLUMN_V1_OBJECTS[name] = cls - - -def inject_feature_column_v2_objects(name, cls): - global FEATURE_COLUMN_V2_OBJECTS - FEATURE_COLUMN_V2_OBJECTS[name] = cls - - def populate_deserializable_objects(): """Populates dict ALL_OBJECTS with every built-in layer. """ @@ -134,9 +122,11 @@ def populate_deserializable_objects(): LOCAL.ALL_OBJECTS['WideDeepModel'] = WideDeepModel if tf2.enabled(): - LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V2_OBJECTS) + from tensorflow.python.keras.feature_column.dense_features_v2 import DenseFeatures # pylint: disable=g-import-not-at-top + LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures else: - LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V1_OBJECTS) + from tensorflow.python.keras.feature_column.dense_features import DenseFeatures # pylint: disable=g-import-not-at-top + LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures # Merge layers, function versions. 
LOCAL.ALL_OBJECTS['add'] = merge.add diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 9cbe8607a54..5e9ccc2d37a 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -39,7 +39,6 @@ from tensorflow.python.distribute import mirrored_strategy from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.feature_column import feature_column_v2 as fc -from tensorflow.python.feature_column.dense_features import DenseFeatures from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -48,6 +47,7 @@ from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import regularizers from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.feature_column.dense_features import DenseFeatures from tensorflow.python.keras.saving.saved_model import load as keras_load from tensorflow.python.keras.saving.saved_model import save_impl as keras_save from tensorflow.python.keras.utils import generic_utils diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt index ecda1603325..ba9156d7f95 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt index f7137f0d09b..130a9954202 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" From ce5488f85ff359f707bb0243ca3a7f2f48cba01f Mon Sep 17 00:00:00 2001 From: Akshay Modi Date: Tue, 12 May 2020 16:54:30 -0700 Subject: [PATCH 074/412] Disable memory_optimizer_test on windows PiperOrigin-RevId: 311229436 Change-Id: Iafaebef3e4574e87b9442c541d5d3de1432426aa --- tensorflow/core/grappler/optimizers/BUILD | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD index 0b8846faf05..b880055b47d 100644 --- a/tensorflow/core/grappler/optimizers/BUILD +++ b/tensorflow/core/grappler/optimizers/BUILD @@ -531,7 +531,10 @@ cc_library( tf_cuda_cc_test( name = "memory_optimizer_test", srcs = ["memory_optimizer_test.cc"], - tags = ["no_cuda_on_cpu_tap"], # Do not re-enable again without actually testing. + tags = [ + "no_cuda_on_cpu_tap", # Do not re-enable again without actually testing. 
+ "no_windows", # b/56402646 + ], deps = [ ":gpu_swapping_kernels", ":gpu_swapping_ops", From bb15c97379f197a6a46ec1446d8fb0b292b860ba Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Tue, 12 May 2020 17:16:41 -0700 Subject: [PATCH 075/412] Restructure the Keras class hierarchy for Network, Model and Sequential. The intention of this change is to reduce the code complexity within Keras class, especially for Network, which currently contains logic for both subclass Model and functional Model. After this change, the subclass model and functional model become individual class and become self contained. 1. Model is now the base class for subclass model. It doesn't contains network structure management, and the topology will be created within __init__ and __call__, which is for user to implement. It also contains compile/fit/eval/predict, which is the basic functionality for model training. 2. Functional is created based on existing Network class. It extends the Model, which allows it leverage compile/fit/eval/predict. In addition, it also take input/output as init parameter and manage the network topology. 3. Sequential model is now a subclass of Functional, since it will use Functional's method to manage it topology (layer stacking). Model(input, output) will create a Functional under the hood, and behave the same way as before. PiperOrigin-RevId: 311232972 Change-Id: I6dd32e089cd294d35d5a1f3684e1a1ae1a0ab320 --- tensorflow/python/keras/engine/BUILD | 6 +- tensorflow/python/keras/engine/base_layer.py | 14 +- .../python/keras/engine/base_layer_v1.py | 6 +- .../engine/{network.py => functional.py} | 905 +----------------- .../{network_test.py => functional_test.py} | 67 +- tensorflow/python/keras/engine/sequential.py | 31 +- tensorflow/python/keras/engine/training.py | 881 +++++++++++++++-- tensorflow/python/keras/engine/training_v1.py | 19 +- .../python/keras/layers/serialization.py | 2 +- .../python/keras/layers/wrappers_test.py | 3 +- tensorflow/python/keras/models.py | 19 +- .../python/keras/saving/hdf5_format_test.py | 13 +- .../python/keras/saving/saved_model/load.py | 25 +- .../saving/saved_model/model_serialization.py | 8 +- .../saved_model/network_serialization.py | 15 +- .../python/keras/utils/version_utils_test.py | 4 +- tensorflow/python/keras/utils/vis_utils.py | 25 +- .../golden/v1/tensorflow.keras.-model.pbtxt | 3 +- .../v1/tensorflow.keras.-sequential.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 3 +- ....keras.experimental.-wide-deep-model.pbtxt | 3 +- .../v1/tensorflow.keras.models.-model.pbtxt | 3 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- .../golden/v2/tensorflow.keras.-model.pbtxt | 3 +- .../v2/tensorflow.keras.-sequential.pbtxt | 2 +- ...low.keras.experimental.-linear-model.pbtxt | 3 +- ....keras.experimental.-wide-deep-model.pbtxt | 3 +- .../v2/tensorflow.keras.models.-model.pbtxt | 3 +- .../tensorflow.keras.models.-sequential.pbtxt | 2 +- 29 files changed, 1023 insertions(+), 1052 deletions(-) rename tensorflow/python/keras/engine/{network.py => functional.py} (58%) rename tensorflow/python/keras/engine/{network_test.py => functional_test.py} (97%) diff --git a/tensorflow/python/keras/engine/BUILD b/tensorflow/python/keras/engine/BUILD index 203e481170f..1ff15d7e2e1 100644 --- a/tensorflow/python/keras/engine/BUILD +++ b/tensorflow/python/keras/engine/BUILD @@ -21,8 +21,8 @@ py_library( srcs = [ "__init__.py", "compile_utils.py", + "functional.py", "input_layer.py", - "network.py", "node.py", "partial_batch_padding_handler.py", "saving.py", @@ 
-460,9 +460,9 @@ tf_py_test( ) tf_py_test( - name = "network_test", + name = "functional_test", size = "medium", - srcs = ["network_test.py"], + srcs = ["functional_test.py"], python_version = "PY3", shard_count = 8, tags = [ diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 210f56ae87a..f6fa17df5c2 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -1006,13 +1006,23 @@ class Layer(module.Module, version_utils.LayerVersionSelector): """Whether the layer is dynamic (eager-only); set in the constructor.""" # NOTE(taylorrobie): Currently self._dynamic is read-only. If that changes # then this cache logic must be updated. - return self._dynamic + return self._dynamic or any(layer.dynamic + for layer in self._unique_sublayers()) + + def _unique_sublayers(self): + # Model.layers will use this as implementation, but we can't expose this + # one as the public property since it might conflict with subclass layers + # which also have user defined layers property. + self._maybe_create_attribute('_layers', []) + return list( + trackable_layer_utils.filter_empty_layer_containers(self._layers)) @property @doc_controls.do_not_doc_inheritable @trackable_layer_utils.cache_recursive_attribute('stateful') def stateful(self): - return self._stateful + return self._stateful or any( + getattr(layer, 'stateful', False) for layer in self._unique_sublayers()) @stateful.setter @trackable_layer_utils.invalidate_recursive_cache('stateful') diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 626892752c8..24d12ae4d59 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -833,13 +833,15 @@ class Layer(base_layer.Layer): def dynamic(self): # NOTE(taylorrobie): Currently self._dynamic is read-only. If that changes # then this cache logic must be updated. 
- return self._dynamic + return self._dynamic or any(layer.dynamic + for layer in self._unique_sublayers()) @property @doc_controls.do_not_generate_docs @trackable_layer_utils.cache_recursive_attribute('stateful') def stateful(self): - return self._stateful + return self._stateful or any( + getattr(layer, 'stateful', False) for layer in self._unique_sublayers()) @stateful.setter @trackable_layer_utils.invalidate_recursive_cache('stateful') diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/functional.py similarity index 58% rename from tensorflow/python/keras/engine/network.py rename to tensorflow/python/keras/engine/functional.py index 87d1953ace5..80eb6cb27d5 100644 --- a/tensorflow/python/keras/engine/network.py +++ b/tensorflow/python/keras/engine/functional.py @@ -22,84 +22,46 @@ from __future__ import print_function import collections import copy import itertools -import json -import os -import six from six.moves import zip # pylint: disable=redefined-builtin -from tensorflow.python.eager import context from tensorflow.python.framework import composite_tensor -from tensorflow.python.framework import errors -from tensorflow.python.framework import errors_impl -from tensorflow.python.framework import func_graph from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_shape from tensorflow.python.keras import backend from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import base_layer_utils -from tensorflow.python.keras.engine import compile_utils from tensorflow.python.keras.engine import input_layer as input_layer_module +from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.engine import training_utils -from tensorflow.python.keras.saving import hdf5_format -from tensorflow.python.keras.saving import save from tensorflow.python.keras.saving.saved_model import network_serialization from tensorflow.python.keras.utils import generic_utils -from tensorflow.python.keras.utils import layer_utils from tensorflow.python.keras.utils import tf_utils -from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite -from tensorflow.python.keras.utils.io_utils import path_to_string from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.training import checkpoint_management -from tensorflow.python.training import py_checkpoint_reader from tensorflow.python.training.tracking import base as trackable -from tensorflow.python.training.tracking import data_structures -from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils -from tensorflow.python.training.tracking import tracking -from tensorflow.python.training.tracking import util as trackable_utils -from tensorflow.python.util import deprecation from tensorflow.python.util import nest -from tensorflow.python.util import serialization from tensorflow.python.util import tf_inspect -from tensorflow.tools.docs import doc_controls -# pylint: disable=g-import-not-at-top -try: - import h5py -except ImportError: - h5py = None +# pylint: disable=g-classes-have-attributes +class Functional(training_lib.Model): + """A `Functional` model is a `Model` defined as a directed graph of layers. 
-try: - import yaml -except ImportError: - yaml = None -# pylint: enable=g-import-not-at-top - - -class Network(base_layer.Layer): - """A `Network` is a composition of layers. - - `Network` is the topological form of a "model". A `Model` - is simply a `Network` with added training routines. - - Two types of `Networks` exist: Graph Networks and Subclass Networks. Graph - networks are used in the Keras Functional and Sequential APIs. Subclassed - networks are used when a user subclasses the `Model` class. In general, - more Keras features are supported with Graph Networks than with Subclassed - Networks, specifically: + Three types of `Model` exist: subclassed `Model`, `Functional` model, + and `Sequential` (a special case of `Functional`). + In general, more Keras features are supported with `Functional` + than with subclassed `Model`s, specifically: - Model cloning (`keras.models.clone`) - Serialization (`model.get_config()/from_config`, `model.to_json()/to_yaml()` - Whole-model saving (`model.save()`) - A Graph Network can be instantiated by passing two arguments to `__init__`. - The first argument is the `keras.Input` Tensors that represent the inputs - to the Network. The second argument specifies the output Tensors that - represent the outputs of this Network. Both arguments can be a nested - structure of Tensors. + A `Functional` model can be instantiated by passing two arguments to + `__init__`. The first argument is the `keras.Input` Tensors that represent + the inputs to the model. The second argument specifies the output + tensors that represent the outputs of this model. Both arguments can be a + nested structure of tensors. Example: @@ -107,10 +69,10 @@ class Network(base_layer.Layer): inputs = {'x1': keras.Input(shape=(10,)), 'x2': keras.Input(shape=(1,))} t = keras.layers.Dense(1, activation='relu')(inputs['x1']) outputs = keras.layers.Add()([t, inputs['x2']) - network = Network(inputs, outputs) + model = keras.Model(inputs, outputs) ``` - A Graph Network constructed using the Functional API can also include raw + A `Functional` model constructed using the Functional API can also include raw TensorFlow functions, with the exception of functions that create Variables or assign ops. @@ -120,38 +82,14 @@ class Network(base_layer.Layer): inputs = keras.Input(shape=(10,)) x = keras.layers.Dense(1)(inputs) outputs = tf.nn.relu(x) - network = Network(inputs, outputs) + model = keras.Model(inputs, outputs) ``` - Subclassed Networks can be instantiated via `name` and (optional) `dynamic` - keyword arguments. Subclassed Networks keep track of their Layers, and their - `call` method can be overridden. Subclassed Networks are typically created - indirectly, by subclassing the `Model` class. - - Example: - - ``` - class MyModel(keras.Model): - def __init__(self): - super(MyModel, self).__init__(name='my_model', dynamic=False) - - self.layer1 = keras.layers.Dense(10, activation='relu') - - def call(self, inputs): - return self.layer1(inputs) - ``` - - Allowed args in `super().__init__`: - name: String name of the model. - dynamic: (Subclassed models only) Set this to `True` if your model should - only be run eagerly, and should not be used to generate a static - computation graph. This attribute is automatically set for Functional API - models. + Arguments: + inputs: List of input tensors (must be created via `tf.keras.Input()`). + outputs: List of outputs tensors. + name: String, optional. Name of the model. trainable: Boolean, whether the model's variables should be trainable. 
- dtype: (Subclassed models only) Default dtype of the model's weights ( - default of `None` means use the type of the first input). This attribute - has no effect on Functional API models, which do not have weights of their - own. """ # See tf.Module for the usage of this property. @@ -160,79 +98,31 @@ class Network(base_layer.Layer): _TF_MODULE_IGNORED_PROPERTIES = frozenset(itertools.chain( ('_layer_call_argspecs', '_compiled_trainable_state', '_output_mask_cache', '_output_tensor_cache', '_output_shape_cache'), - base_layer.Layer._TF_MODULE_IGNORED_PROPERTIES + training_lib.Model._TF_MODULE_IGNORED_PROPERTIES )) - def __init__(self, *args, **kwargs): # pylint: disable=super-init-not-called - # Signature detection - if (len(args) == 2 or - len(args) == 1 and 'outputs' in kwargs or - 'inputs' in kwargs and 'outputs' in kwargs): - # Graph network - self._init_graph_network(*args, **kwargs) - else: - # Subclassed network - self._init_subclassed_network(**kwargs) - - tf_utils.assert_no_legacy_layers(self.layers) - - # Several Network methods have "no_automatic_dependency_tracking" - # annotations. Since Network does automatic dependency tracking on attribute - # assignment, including for common data structures such as lists, by default - # we'd have quite a few empty dependencies which users don't care about (or - # would need some way to ignore dependencies automatically, which is confusing - # when applied to user code). Some attributes, such as _layers, would cause - # structural issues (_layers being the place where Layers assigned to tracked - # attributes are stored). - # - # Aside from these aesthetic and structural issues, useless dependencies on - # empty lists shouldn't cause issues; adding or removing them will not break - # checkpoints, but may cause "all Python objects matched" assertions to fail - # (in which case less strict assertions may be substituted if necessary). @trackable.no_automatic_dependency_tracking - def _base_init(self, **kwargs): - # The following are implemented as property functions: - # self.trainable_weights - # self.non_trainable_weights - # self.input_spec - # self.losses - # self.updates - - generic_utils.validate_kwargs(kwargs, {'trainable', 'dtype', 'dynamic', - 'name', 'autocast'}) - - super(Network, self).__init__(**kwargs) - - self.input_names = None - self.output_names = None - self._saved_model_inputs_spec = None - - # This is True for Sequential networks and Functional networks. - self._compute_output_and_mask_jointly = False - - # Don't reset compilation if already done. This may occur if calling - # `__init__` (or `_init_graph_network`) on an already-compiled model - # such as a Sequential model. Sequential models may need to rebuild - # themselves after compilation. - self._maybe_create_attribute('_is_compiled', False) - self._maybe_create_attribute('optimizer', None) - - self._trackable_saver = ( - trackable_utils.saver_with_op_caching(self)) + def __init__(self, inputs=None, outputs=None, name=None, trainable=True): + # generic_utils.validate_kwargs( + # kwargs, {'name', 'trainable'}, + # 'Functional models may only specify `name` and `trainable` keyword ' + # 'arguments during initialization. 
Got an unexpected argument:') + super(Functional, self).__init__(name=name, trainable=trainable) + self._init_graph_network(inputs, outputs) @trackable.no_automatic_dependency_tracking - def _init_graph_network(self, inputs, outputs, **kwargs): - generic_utils.validate_kwargs( - kwargs, {'name', 'trainable'}, - 'Functional models may only specify `name` and `trainable` keyword ' - 'arguments during initialization. Got an unexpected argument:') + def _init_graph_network(self, inputs, outputs): + # This method is needed for Sequential to reinitialize graph network when + # layer is added or removed. + self._is_graph_network = True + # Normalize and set self.inputs, self.outputs. if isinstance(inputs, list) and len(nest.flatten(inputs)) == 1: inputs = inputs[0] if isinstance(outputs, list) and len(nest.flatten(outputs)) == 1: outputs = outputs[0] - self._nested_outputs = outputs self._nested_inputs = inputs + self._nested_outputs = outputs self.inputs = nest.flatten(inputs) self.outputs = nest.flatten(outputs) @@ -247,7 +137,6 @@ class Network(base_layer.Layer): if any(not hasattr(tensor, '_keras_history') for tensor in self.outputs): base_layer_utils.create_keras_history(self._nested_outputs) - self._base_init(**kwargs) self._validate_graph_inputs_and_outputs() # A Network does not create weights of its own, thus it is already @@ -255,7 +144,6 @@ class Network(base_layer.Layer): self.built = True self._build_input_shape = nest.map_structure(lambda x: x.shape, inputs) self._compute_output_and_mask_jointly = True - self._is_graph_network = True # `_expects_training_arg` is True since the `training` argument is always # present in the signature of the `call` method of a graph network. self._expects_training_arg = True @@ -325,6 +213,7 @@ class Network(base_layer.Layer): self._compute_tensor_usage_count() self._set_save_spec(self._nested_inputs) + tf_utils.assert_no_legacy_layers(self.layers) @property def input(self): @@ -340,9 +229,7 @@ class Network(base_layer.Layer): RuntimeError: If called in Eager mode. AttributeError: If no inbound nodes are found. """ - if self._is_graph_network: - return self._nested_inputs - return super(Network, self).input + return self._nested_inputs @property def input_shape(self): @@ -360,9 +247,7 @@ class Network(base_layer.Layer): AttributeError: if the layer has no defined input_shape. RuntimeError: if called in Eager mode. """ - if self._is_graph_network: - return nest.map_structure(backend.int_shape, self.input) - return super(Network, self).input_shape + return nest.map_structure(backend.int_shape, self.input) @property def output(self): @@ -379,9 +264,7 @@ class Network(base_layer.Layer): layers. RuntimeError: if called in Eager mode. """ - if self._is_graph_network: - return self._nested_outputs - return super(Network, self).output + return self._nested_outputs @property def output_shape(self): @@ -398,9 +281,7 @@ class Network(base_layer.Layer): AttributeError: if the layer has no defined output shape. RuntimeError: if called in Eager mode. """ - if self._is_graph_network: - return nest.map_structure(backend.int_shape, self.output) - return super(Network, self).output_shape + return nest.map_structure(backend.int_shape, self.output) def _set_output_names(self): """Assigns unique names to the Network's outputs. 
@@ -421,29 +302,9 @@ class Network(base_layer.Layer): uniquified.append(proposal) self.output_names = uniquified - @trackable.no_automatic_dependency_tracking - def _init_subclassed_network(self, **kwargs): - self._base_init(**kwargs) - self._is_graph_network = False - self.inputs = None - self.outputs = None - - @property - @trackable_layer_utils.cache_recursive_attribute('dynamic') - def dynamic(self): - if self._is_graph_network: - return any(layer.dynamic for layer in self.layers) - return self._dynamic or any(layer.dynamic for layer in self.layers) - @property def _layer_checkpoint_dependencies(self): """Dictionary of layer dependencies to be included in the checkpoint.""" - # Use getattr because this function can be called from __setattr__, at which - # point the _is_graph_network attribute has not been created. - if (not getattr(self, '_is_graph_network', False) and - base_layer_utils.is_subclassed(self)): - return {} # Only add layer dependencies for graph networks - weight_layer_index = 0 dependencies = collections.OrderedDict() @@ -470,14 +331,14 @@ class Network(base_layer.Layer): dependencies = [ trackable.TrackableReference(name=name, ref=layer) for name, layer in self._layer_checkpoint_dependencies.items()] - dependencies.extend(super(Network, self)._checkpoint_dependencies) + dependencies.extend(super(Functional, self)._checkpoint_dependencies) return dependencies def _lookup_dependency(self, name): layer_dependencies = self._layer_checkpoint_dependencies if name in layer_dependencies: return layer_dependencies[name] - return super(Network, self)._lookup_dependency(name) + return super(Functional, self)._lookup_dependency(name) def _handle_deferred_layer_dependencies(self, layers): """Handles layer checkpoint dependencies that are added after init.""" @@ -488,263 +349,17 @@ class Network(base_layer.Layer): self._handle_deferred_dependencies(name=layer_to_name[layer], trackable=layer) - def __setattr__(self, name, value): - if not getattr(self, '_self_setattr_tracking', True): - super(Network, self).__setattr__(name, value) - return - - if all( - isinstance(v, (base_layer.Layer, - data_structures.TrackableDataStructure)) or - trackable_layer_utils.has_weights(v) for v in nest.flatten(value)): - try: - self._is_graph_network - except AttributeError: - # six.raise_from supresses the original AttributeError from being raised - six.raise_from( - RuntimeError('It looks like you are subclassing `Model` and you ' - 'forgot to call `super(YourClass, self).__init__()`.' - ' Always start with this line.'), None) - - super(Network, self).__setattr__(name, value) - - # Keep track of metric instance created in subclassed model/layer. - # We do this so that we can maintain the correct order of metrics by adding - # the instance to the `metrics` list as soon as it is created. 
- from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top - if isinstance(value, metrics_module.Metric): - self._metrics.append(value) - @property - @trackable_layer_utils.cache_recursive_attribute('stateful') - def stateful(self): - return any(getattr(layer, 'stateful', False) for layer in self.layers) - - def reset_states(self): - for layer in self.layers: - if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False): - layer.reset_states() - - @property - @deprecation.deprecated( - date=None, - instructions='This property should not be used in TensorFlow 2.0, ' - 'as updates are applied automatically.') - @doc_controls.do_not_generate_docs - def state_updates(self): - """Deprecated, do NOT use! - - Returns the `updates` from all layers that are stateful. - - This is useful for separating training updates and - state updates, e.g. when we need to update a layer's internal state - during prediction. - - Returns: - A list of update ops. - """ - state_updates = [] - for layer in self.layers: - if getattr(layer, 'stateful', False): - if hasattr(layer, 'updates'): - state_updates += layer.updates - return state_updates - - @property - def weights(self): - """Returns the list of all layer variables/weights. - - Returns: - A list of variables. - """ - return self._dedup_weights(self._undeduplicated_weights) - - @property - def _undeduplicated_weights(self): - """Returns the undeduplicated list of all layer variables/weights.""" - self._assert_weights_created() - weights = [] - for layer in self._layers: - weights += layer.weights - weights += (self._trainable_weights + self._non_trainable_weights) - return weights - - @property - @tracking.cached_per_instance def _should_compute_mask(self): - return self._is_graph_network and super(Network, self)._should_compute_mask + return True def compute_mask(self, inputs, mask): - if not self._is_graph_network: - return None - # TODO(omalleyt): b/123540974 This function is not really safe to call # by itself because it will duplicate any updates and losses in graph # mode by `call`ing the Layers again. output_tensors = self._run_internal_graph(inputs, mask=mask) return nest.map_structure(lambda t: t._keras_mask, output_tensors) - @property - def layers(self): - return list( - trackable_layer_utils.filter_empty_layer_containers(self._layers)) - - def get_layer(self, name=None, index=None): - """Retrieves a layer based on either its name (unique) or index. - - If `name` and `index` are both provided, `index` will take precedence. - Indices are based on order of horizontal graph traversal (bottom-up). - - Arguments: - name: String, name of layer. - index: Integer, index of layer. - - Returns: - A layer instance. - - Raises: - ValueError: In case of invalid layer name or index. - """ - # TODO(fchollet): We could build a dictionary based on layer names - # since they are constant, but we have not done that yet. 
- if index is not None and name is not None: - raise ValueError('Provide only a layer name or a layer index.') - - if index is not None: - if len(self.layers) <= index: - raise ValueError('Was asked to retrieve layer at index ' + str(index) + - ' but model only has ' + str(len(self.layers)) + - ' layers.') - else: - return self.layers[index] - - if name is not None: - for layer in self.layers: - if layer.name == name: - return layer - raise ValueError('No such layer: ' + name + '.') - raise ValueError('Provide either a layer name or layer index.') - - @property - def trainable_weights(self): - self._assert_weights_created() - return self._dedup_weights( - trackable_layer_utils.gather_trainable_weights( - trainable=self.trainable, - sub_layers=self._layers, - extra_variables=self._trainable_weights)) - - @property - def non_trainable_weights(self): - self._assert_weights_created() - return self._dedup_weights( - trackable_layer_utils.gather_non_trainable_weights( - trainable=self.trainable, - sub_layers=self._layers, - extra_variables=self._non_trainable_weights + - self._trainable_weights)) - - @generic_utils.default - def build(self, input_shape): - """Builds the model based on input shapes received. - - This is to be used for subclassed models, which do not know at instantiation - time what their inputs look like. - - This method only exists for users who want to call `model.build()` in a - standalone way (as a substitute for calling the model on real data to - build it). It will never be called by the framework (and thus it will - never throw unexpected errors in an unrelated workflow). - - Args: - input_shape: Single tuple, TensorShape, or list of shapes, where shapes - are tuples, integers, or TensorShapes. - - Raises: - ValueError: - 1. In case of invalid user-provided data (not of type tuple, - list, or TensorShape). - 2. If the model requires call arguments that are agnostic - to the input shapes (positional or kwarg in call signature). - 3. If not all layers were properly built. - 4. If float type inputs are not supported within the layers. - - In each of these cases, the user should build their model by calling it - on real tensor data. - """ - if self._is_graph_network: - super(Network, self).build(input_shape) - return - - # If subclass network - if input_shape is None: - raise ValueError('Input shape must be defined when calling build on a ' - 'model subclass network.') - valid_types = (tuple, list, tensor_shape.TensorShape) - if not isinstance(input_shape, valid_types): - raise ValueError('Specified input shape is not one of the valid types. ' - 'Please specify a batch input shape of type tuple or ' - 'list of input shapes. User provided ' - 'input type: {}'.format(type(input_shape))) - - if input_shape and not self.inputs: - # We create placeholders for the `None`s in the shape and build the model - # in a Graph. Since tf.Variable is compatible with both eager execution - # and graph building, the variables created after building the model in - # a Graph are still valid when executing eagerly. 
- if context.executing_eagerly(): - graph = func_graph.FuncGraph('build_graph') - else: - graph = backend.get_graph() - with graph.as_default(): - if isinstance(input_shape, list): - x = [base_layer_utils.generate_placeholders_from_shape(shape) - for shape in input_shape] - elif isinstance(input_shape, dict): - x = { - k: base_layer_utils.generate_placeholders_from_shape(shape) - for k, shape in input_shape.items() - } - else: - x = base_layer_utils.generate_placeholders_from_shape(input_shape) - - kwargs = {} - call_signature = self._call_full_argspec - call_args = call_signature.args - # Exclude `self`, `inputs`, and any argument with a default value. - if len(call_args) > 2: - if call_signature.defaults: - call_args = call_args[2:-len(call_signature.defaults)] - else: - call_args = call_args[2:] - for arg in call_args: - if arg == 'training': - # Case where `training` is a positional arg with no default. - kwargs['training'] = False - else: - # Has invalid call signature with unknown positional arguments. - raise ValueError( - 'Currently, you cannot build your model if it has ' - 'positional or keyword arguments that are not ' - 'inputs to the model, but are required for its ' - '`call` method. Instead, in order to instantiate ' - 'and build your model, `call` your model on real ' - 'tensor data with all expected call arguments.') - elif len(call_args) < 2: - # Signature without `inputs`. - raise ValueError('You can only call `build` on a model if its `call` ' - 'method accepts an `inputs` argument.') - try: - self.call(x, **kwargs) - except (errors.InvalidArgumentError, TypeError): - raise ValueError('You cannot build your model by calling `build` ' - 'if your layers do not support float type inputs. ' - 'Instead, in order to instantiate and build your ' - 'model, `call` your model on real tensor data (of ' - 'the correct dtype).') - - super(Network, self).build(input_shape) - def call(self, inputs, training=None, mask=None): """Calls the model on new inputs. @@ -763,17 +378,10 @@ class Network(base_layer.Layer): A tensor if there is a single output, or a list of tensors if there are more than one outputs. """ - if not self._is_graph_network: - raise NotImplementedError('When subclassing the `Model` class, you should' - ' implement a `call` method.') - return self._run_internal_graph( inputs, training=training, mask=mask) def compute_output_shape(self, input_shape): - if not self._is_graph_network: - return super(Network, self).compute_output_shape(input_shape) - # Convert any shapes in tuple format to TensorShapes. input_shape = tf_utils.convert_shapes(input_shape, to_tuples=False) @@ -975,8 +583,6 @@ class Network(base_layer.Layer): return tensor def get_config(self): - if not self._is_graph_network: - raise NotImplementedError return copy.deepcopy(get_network_config(self)) @classmethod @@ -1002,373 +608,6 @@ class Network(base_layer.Layer): connect_ancillary_layers(model, created_layers) return model - def save(self, - filepath, - overwrite=True, - include_optimizer=True, - save_format=None, - signatures=None, - options=None): - """Saves the model to Tensorflow SavedModel or a single HDF5 file. - - The savefile includes: - - - The model architecture, allowing to re-instantiate the model. - - The model weights. - - The state of the optimizer, allowing to resume training - exactly where you left off. - - This allows you to save the entirety of the state of a model - in a single file. - - Saved models can be reinstantiated via `keras.models.load_model`. 
- The model returned by `load_model` is a compiled model ready to be used - (unless the saved model was never compiled in the first place). - - Models built with the Sequential and Functional API can be saved to both the - HDF5 and SavedModel formats. Subclassed models can only be saved with the - SavedModel format. - - Note that the model weights may have different scoped names after being - loaded. Scoped names include the model/layer names, such as - `"dense_1/kernel:0"`. It is recommended that you use the layer properties to - access specific variables, e.g. `model.get_layer("dense_1").kernel`. - - Arguments: - filepath: String, PathLike, path to SavedModel or H5 file to save the - model. - overwrite: Whether to silently overwrite any existing file at the - target location, or provide the user with a manual prompt. - include_optimizer: If True, save optimizer's state together. - save_format: Either `'tf'` or `'h5'`, indicating whether to save the - model to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, - and 'h5' in TF 1.X. - signatures: Signatures to save with the SavedModel. Applicable to the - 'tf' format only. Please see the `signatures` argument in - `tf.saved_model.save` for details. - options: Optional `tf.saved_model.SaveOptions` object that specifies - options for saving to SavedModel. - - Example: - - ```python - from keras.models import load_model - - model.save('my_model.h5') # creates a HDF5 file 'my_model.h5' - del model # deletes the existing model - - # returns a compiled model - # identical to the previous one - model = load_model('my_model.h5') - ``` - """ - save.save_model(self, filepath, overwrite, include_optimizer, save_format, - signatures, options) - - def save_weights(self, filepath, overwrite=True, save_format=None): - """Saves all layer weights. - - Either saves in HDF5 or in TensorFlow format based on the `save_format` - argument. - - When saving in HDF5 format, the weight file has: - - `layer_names` (attribute), a list of strings - (ordered names of model layers). - - For every layer, a `group` named `layer.name` - - For every such layer group, a group attribute `weight_names`, - a list of strings - (ordered names of weights tensor of the layer). - - For every weight in the layer, a dataset - storing the weight value, named after the weight tensor. - - When saving in TensorFlow format, all objects referenced by the network are - saved in the same format as `tf.train.Checkpoint`, including any `Layer` - instances or `Optimizer` instances assigned to object attributes. For - networks constructed from inputs and outputs using `tf.keras.Model(inputs, - outputs)`, `Layer` instances used by the network are tracked/saved - automatically. For user-defined classes which inherit from `tf.keras.Model`, - `Layer` instances must be assigned to object attributes, typically in the - constructor. See the documentation of `tf.train.Checkpoint` and - `tf.keras.Model` for details. - - While the formats are the same, do not mix `save_weights` and - `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should be - loaded using `Model.load_weights`. Checkpoints saved using - `tf.train.Checkpoint.save` should be restored using the corresponding - `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over - `save_weights` for training checkpoints. - - The TensorFlow format matches objects and variables by starting at a root - object, `self` for `save_weights`, and greedily matching attribute - names. 
For `Model.save` this is the `Model`, and for `Checkpoint.save` this - is the `Checkpoint` even if the `Checkpoint` has a model attached. This - means saving a `tf.keras.Model` using `save_weights` and loading into a - `tf.train.Checkpoint` with a `Model` attached (or vice versa) will not match - the `Model`'s variables. See the [guide to training - checkpoints](https://www.tensorflow.org/guide/checkpoint) for details - on the TensorFlow format. - - Arguments: - filepath: String or PathLike, path to the file to save the weights to. - When saving in TensorFlow format, this is the prefix used for - checkpoint files (multiple files are generated). Note that the '.h5' - suffix causes weights to be saved in HDF5 format. - overwrite: Whether to silently overwrite any existing file at the - target location, or provide the user with a manual prompt. - save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or - '.keras' will default to HDF5 if `save_format` is `None`. Otherwise - `None` defaults to 'tf'. - - Raises: - ImportError: If h5py is not available when attempting to save in HDF5 - format. - ValueError: For invalid/unknown format arguments. - """ - self._assert_weights_created() - filepath = path_to_string(filepath) - filepath_is_h5 = _is_hdf5_filepath(filepath) - if save_format is None: - if filepath_is_h5: - save_format = 'h5' - else: - save_format = 'tf' - else: - user_format = save_format.lower().strip() - if user_format in ('tensorflow', 'tf'): - save_format = 'tf' - elif user_format in ('hdf5', 'h5', 'keras'): - save_format = 'h5' - else: - raise ValueError( - 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( - save_format,)) - if save_format == 'tf' and filepath_is_h5: - raise ValueError( - ('save_weights got save_format="tf"/"tensorflow", but the ' - 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' - 'when saving in TensorFlow format.') - % filepath) - - if save_format == 'h5' and h5py is None: - raise ImportError( - '`save_weights` requires h5py when saving in hdf5.') - if save_format == 'tf': - check_filepath = filepath + '.index' - else: - check_filepath = filepath - # If file exists and should not be overwritten: - if not overwrite and os.path.isfile(check_filepath): - proceed = ask_to_proceed_with_overwrite(check_filepath) - if not proceed: - return - if save_format == 'h5': - with h5py.File(filepath, 'w') as f: - hdf5_format.save_weights_to_hdf5_group(f, self.layers) - else: - if context.executing_eagerly(): - session = None - else: - session = backend.get_session() - optimizer = getattr(self, 'optimizer', None) - if (optimizer - and not isinstance(optimizer, trackable.Trackable)): - logging.warning( - ('This model was compiled with a Keras optimizer (%s) but is being ' - 'saved in TensorFlow format with `save_weights`. The model\'s ' - 'weights will be saved, but unlike with TensorFlow optimizers in ' - 'the TensorFlow format the optimizer\'s state will not be ' - 'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.') - % (optimizer,)) - self._trackable_saver.save(filepath, session=session) - # Record this checkpoint so it's visible from tf.train.latest_checkpoint. - checkpoint_management.update_checkpoint_state_internal( - save_dir=os.path.dirname(filepath), - model_checkpoint_path=filepath, - save_relative_paths=True, - all_model_checkpoint_paths=[filepath]) - - def load_weights(self, filepath, by_name=False, skip_mismatch=False): - """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. 
- - If `by_name` is False weights are loaded based on the network's - topology. This means the architecture should be the same as when the weights - were saved. Note that layers that don't have weights are not taken into - account in the topological ordering, so adding or removing layers is fine as - long as they don't have weights. - - If `by_name` is True, weights are loaded into layers only if they share the - same name. This is useful for fine-tuning or transfer-learning models where - some of the layers have changed. - - Only topological loading (`by_name=False`) is supported when loading weights - from the TensorFlow format. Note that topological loading differs slightly - between TensorFlow and HDF5 formats for user-defined classes inheriting from - `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the - TensorFlow format loads based on the object-local names of attributes to - which layers are assigned in the `Model`'s constructor. - - Arguments: - filepath: String or PathLike, path to the weights file to load. For - weight files in TensorFlow format, this is the file prefix (the - same as was passed to `save_weights`). - by_name: Boolean, whether to load weights by name or by topological - order. Only topological loading is supported for weight files in - TensorFlow format. - skip_mismatch: Boolean, whether to skip loading of layers where there is - a mismatch in the number of weights, or a mismatch in the shape of - the weight (only valid when `by_name=True`). - - Returns: - When loading a weight file in TensorFlow format, returns the same status - object as `tf.train.Checkpoint.restore`. When graph building, restore - ops are run automatically as soon as the network is built (on first call - for user-defined classes inheriting from `Model`, immediately if it is - already built). - - When loading weights in HDF5 format, returns `None`. - - Raises: - ImportError: If h5py is not available and the weight file is in HDF5 - format. - ValueError: If `skip_mismatch` is set to `True` when `by_name` is - `False`. - """ - - if skip_mismatch and not by_name: - raise ValueError( - 'When calling model.load_weights, skip_mismatch can only be set to ' - 'True when by_name is True.') - - filepath = path_to_string(filepath) - if _is_hdf5_filepath(filepath): - save_format = 'h5' - else: - try: - py_checkpoint_reader.NewCheckpointReader(filepath) - save_format = 'tf' - except errors_impl.DataLossError: - # The checkpoint is not readable in TensorFlow format. Try HDF5. - save_format = 'h5' - if save_format == 'tf': - status = self._trackable_saver.restore(filepath) - if by_name: - raise NotImplementedError( - 'Weights may only be loaded based on topology into Models when ' - 'loading TensorFlow-formatted weights (got by_name=True to ' - 'load_weights).') - if not context.executing_eagerly(): - session = backend.get_session() - # Restore existing variables (if any) immediately, and set up a - # streaming restore for any variables created in the future. - trackable_utils.streaming_restore(status=status, session=session) - status.assert_nontrivial_match() - return status - if h5py is None: - raise ImportError( - '`load_weights` requires h5py when loading weights from HDF5.') - if self._is_graph_network and not self.built: - raise NotImplementedError( - 'Unable to load weights saved in HDF5 format into a subclassed ' - 'Model which has not created its variables yet. 
Call the Model ' - 'first, then load the weights.') - self._assert_weights_created() - with h5py.File(filepath, 'r') as f: - if 'layer_names' not in f.attrs and 'model_weights' in f: - f = f['model_weights'] - if by_name: - hdf5_format.load_weights_from_hdf5_group_by_name( - f, self.layers, skip_mismatch=skip_mismatch) - else: - hdf5_format.load_weights_from_hdf5_group(f, self.layers) - - def _updated_config(self): - """Util shared between different serialization methods. - - Returns: - Model config with Keras version information added. - """ - from tensorflow.python.keras import __version__ as keras_version # pylint: disable=g-import-not-at-top - - config = self.get_config() - model_config = { - 'class_name': self.__class__.__name__, - 'config': config, - 'keras_version': keras_version, - 'backend': backend.backend() - } - return model_config - - def to_json(self, **kwargs): - """Returns a JSON string containing the network configuration. - - To load a network from a JSON save file, use - `keras.models.model_from_json(json_string, custom_objects={})`. - - Arguments: - **kwargs: Additional keyword arguments - to be passed to `json.dumps()`. - - Returns: - A JSON string. - """ - model_config = self._updated_config() - return json.dumps( - model_config, default=serialization.get_json_type, **kwargs) - - def to_yaml(self, **kwargs): - """Returns a yaml string containing the network configuration. - - To load a network from a yaml save file, use - `keras.models.model_from_yaml(yaml_string, custom_objects={})`. - - `custom_objects` should be a dictionary mapping - the names of custom losses / layers / etc to the corresponding - functions / classes. - - Arguments: - **kwargs: Additional keyword arguments - to be passed to `yaml.dump()`. - - Returns: - A YAML string. - - Raises: - ImportError: if yaml module is not found. - """ - if yaml is None: - raise ImportError( - 'Requires yaml module installed (`pip install pyyaml`).') - return yaml.dump(self._updated_config(), **kwargs) - - def summary(self, line_length=None, positions=None, print_fn=None): - """Prints a string summary of the network. - - Arguments: - line_length: Total length of printed lines - (e.g. set this to adapt the display to different - terminal window sizes). - positions: Relative or absolute positions of log elements - in each line. If not provided, - defaults to `[.33, .55, .67, 1.]`. - print_fn: Print function to use. Defaults to `print`. - It will be called on each line of the summary. - You can set it to a custom function - in order to capture the string summary. - - Raises: - ValueError: if `summary()` is called before the model is built. - """ - if not self.built: - raise ValueError('This model has not yet been built. ' - 'Build the model first by calling `build()` or calling ' - '`fit()` with some data, or specify ' - 'an `input_shape` argument in the first layer(s) for ' - 'automatic build.') - layer_utils.print_summary(self, - line_length=line_length, - positions=positions, - print_fn=print_fn) - def _validate_graph_inputs_and_outputs(self): """Validates the inputs and outputs of a Graph Network.""" # Check for redundancy in inputs. @@ -1542,30 +781,9 @@ class Network(base_layer.Layer): self._tensor_usage_count = tensor_usage_count def _assert_weights_created(self): - """Asserts that all the weights for the network have been created. - - For a non-dynamic network, the weights must already be created after the - layer has been called. 
For a dynamic network, the exact list of weights can - never be known for certain since it may change at any time during execution. - - We run this check right before accessing weights or getting the Numpy value - for the current weights. Otherwise, if the layer has never been called, - the user would just get an empty list, which is misleading. - - Raises: - ValueError: if the weights of the network has not yet been created. - """ - if self.dynamic: - return - if (not self._is_graph_network and - 'build' in self.__class__.__dict__ and - not self.built): - # For any model that has customized build() method but hasn't - # been invoked yet, this will cover both sequential and subclass model. - raise ValueError('Weights for model %s have not yet been created. ' - 'Weights are created when the Model is first called on ' - 'inputs or `build()` is called with an `input_shape`.' % - self.name) + # Override the implementation in Model. + # The Functional model should always have weight created already. + return def _graph_network_add_loss(self, symbolic_loss): new_nodes, new_layers = _map_subgraph_network(self.inputs, [symbolic_loss]) @@ -1587,42 +805,11 @@ class Network(base_layer.Layer): new_layers.append(add_metric_layer) self._insert_layers(new_layers, new_nodes) - @trackable.no_automatic_dependency_tracking - def _set_save_spec(self, inputs): - if self._saved_model_inputs_spec is not None: - return # Already set. - - input_names = self.input_names - if not input_names: - input_names = compile_utils.create_pseudo_input_names(inputs) - - flat_inputs = nest.flatten(inputs) - specs = [] - for name, tensor in zip(input_names, flat_inputs): - specs.append( - tf_utils.get_tensor_spec(tensor, dynamic_batch=False, name=name)) - specs = nest.pack_sequence_as(inputs, specs) - - self._saved_model_inputs_spec = specs - - def _get_save_spec(self, dynamic_batch=True): - if self._saved_model_inputs_spec is None: - return None - - return nest.map_structure( - lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), - self._saved_model_inputs_spec) - @property def _trackable_saved_model_saver(self): return network_serialization.NetworkSavedModelSaver(self) -def _is_hdf5_filepath(filepath): - return (filepath.endswith('.h5') or filepath.endswith('.keras') or - filepath.endswith('.hdf5')) - - def _make_node_key(layer_name, node_index): return layer_name + '_ib-' + str(node_index) @@ -1830,7 +1017,7 @@ def _map_subgraph_network(inputs, outputs): def _should_skip_first_node(layer): """Returns True if the first layer node should not be saved or loaded.""" # Networks start with a pre-existing node linking their input to output. 
- return issubclass(layer.__class__, Network) and layer._is_graph_network + return isinstance(layer, Functional) def _deserialize_keras_tensors(kwargs, layer_map): diff --git a/tensorflow/python/keras/engine/network_test.py b/tensorflow/python/keras/engine/functional_test.py similarity index 97% rename from tensorflow/python/keras/engine/network_test.py rename to tensorflow/python/keras/engine/functional_test.py index b4e8adf2c49..90fc9f2697f 100644 --- a/tensorflow/python/keras/engine/network_test.py +++ b/tensorflow/python/keras/engine/functional_test.py @@ -33,8 +33,8 @@ from tensorflow.python.keras import layers from tensorflow.python.keras import models from tensorflow.python.keras import testing_utils from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import functional from tensorflow.python.keras.engine import input_layer as input_layer_lib -from tensorflow.python.keras.engine import network as network_lib from tensorflow.python.keras.engine import sequential from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.utils import layer_utils @@ -89,7 +89,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): self.assertEqual(len(layer.updates), 3) - network = network_lib.Network(x2, y2) + network = functional.Functional(x2, y2) self.assertEqual(len(network.updates), 3) x3 = input_layer_lib.Input(shape=(1,)) @@ -120,7 +120,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): dense_a = layers.Dense(4, name='dense_a') dense_b = layers.Dense(2, name='dense_b') y = dense_b(dense_a(x)) - network = network_lib.Network(x, y, name='dense_network') + network = functional.Functional(x, y, name='dense_network') # test various get_layer by index self.assertEqual(network.get_layer(index=1), dense_a) @@ -251,7 +251,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): x = input_layer_lib.Input(shape=(32,)) dense = layers.Dense(2) y = dense(x) - network = network_lib.Network(x, y, name='dense_network') + network = functional.Functional(x, y, name='dense_network') # test basic attributes self.assertEqual(network.name, 'dense_network') @@ -740,7 +740,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): else: x = input_layer_lib.Input(shape=(32,)) y = MaskedLayer()(x) # pylint: disable=not-callable - network = network_lib.Network(x, y) + network = functional.Functional(x, y) # test callability on Input x_2 = input_layer_lib.Input(shape=(32,)) @@ -1102,7 +1102,7 @@ class NetworkConstructionTest(keras_parameterized.TestCase): def test_subclassed_error_if_init_not_called(self): - class MyNetwork(network_lib.Network): + class MyNetwork(training_lib.Model): def __init__(self): self._foo = [layers.Dense(10), layers.Dense(10)] @@ -1124,10 +1124,12 @@ class NetworkConstructionTest(keras_parameterized.TestCase): inputs = input_layer_lib.Input(shape=(32,)) outputs = layers.Dense(4)(inputs) - with self.assertRaisesRegexp(TypeError, 'unexpected argument'): + with self.assertRaisesRegexp(TypeError, + 'got an unexpected keyword argument'): model = training_lib.Model( inputs, outputs, name='m', trainable=False, dtype='int64') - with self.assertRaisesRegexp(TypeError, 'unexpected argument'): + with self.assertRaisesRegexp(TypeError, + 'got an unexpected keyword argument'): model = training_lib.Model( inputs, outputs, name='m', trainable=False, dynamic=False) @@ -1136,8 +1138,10 @@ class NetworkConstructionTest(keras_parameterized.TestCase): self.assertFalse(model.trainable) 
self.assertFalse(model.dynamic) + class SubclassModel(training_lib.Model): + pass # Subclassed model - model = training_lib.Model( + model = SubclassModel( name='subclassed', trainable=True, dtype='int64', dynamic=True) self.assertEqual('subclassed', model.name) self.assertTrue(model.dynamic) @@ -1150,9 +1154,9 @@ class NetworkConstructionTest(keras_parameterized.TestCase): input_tensor2 = input_layer_lib.Input(shape=[10], name='b') output_tensor1 = layers.Dense(units=10)(input_tensor1) - net = network_lib.Network( + net = functional.Functional( inputs=[input_tensor1, input_tensor2], outputs=[output_tensor1]) - net2 = network_lib.Network.from_config(net.get_config()) + net2 = functional.Functional.from_config(net.get_config()) self.assertLen(net2.inputs, 2) self.assertEqual('a', net2.layers[0].name) self.assertEqual('b', net2.layers[1].name) @@ -1180,8 +1184,8 @@ class DeferredModeTest(keras_parameterized.TestCase): self.assertEqual(x.shape.as_list(), [None, 2]) outputs = layers.Dense(4)(x) - network = network_lib.Network(inputs, outputs) - self.assertIsInstance(network, network_lib.Network) + network = functional.Functional(inputs, outputs) + self.assertIsInstance(network, functional.Functional) if context.executing_eagerly(): # It should be possible to call such a network on EagerTensors. @@ -1204,7 +1208,7 @@ class DeferredModeTest(keras_parameterized.TestCase): c = AddLayer()([a, input_b]) # pylint: disable=not-callable c = layers.Dense(2)(c) - network = network_lib.Network([input_a, input_b], [a, c]) + network = functional.Functional([input_a, input_b], [a, c]) if context.executing_eagerly(): a_val = constant_op.constant( np.random.random((10, 32)).astype('float32')) @@ -1484,9 +1488,9 @@ class NestedNetworkTest(keras_parameterized.TestCase): 'x2': input_layer_lib.Input(shape=(1,)) } outputs = layers.Add()([inputs['x1'], inputs['x2']]) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) - network = network_lib.Network.from_config(network.get_config()) + network = functional.Functional.from_config(network.get_config()) result_tensor = network({ 'x': array_ops.ones((1, 1), 'float32'), @@ -1509,9 +1513,9 @@ class NestedNetworkTest(keras_parameterized.TestCase): 'x*x': layers.Multiply()([inputs, inputs]) } - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) - network = network_lib.Network.from_config(network.get_config()) + network = functional.Functional.from_config(network.get_config()) result_tensor = network(array_ops.ones((1, 1), 'float32')) result = self.evaluate(result_tensor) @@ -1531,7 +1535,8 @@ class NestedNetworkTest(keras_parameterized.TestCase): 'x1+x2': layers.Add()([inner_inputs['x1'], inner_inputs['x2']]), 'x1*x2': layers.Multiply()([inner_inputs['x1'], inner_inputs['x2']]) } - inner_network = network_lib.Network(inner_inputs, inner_outputs) + inner_network = functional.Functional( + inner_inputs, inner_outputs) inputs = [ input_layer_lib.Input(shape=(1,)), @@ -1539,9 +1544,9 @@ class NestedNetworkTest(keras_parameterized.TestCase): ] middle = inner_network({'x1': inputs[0], 'x2': inputs[1]}) outputs = layers.Add()([middle['x1+x2'], middle['x1*x2']]) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) - network = network_lib.Network.from_config(network.get_config()) + network = functional.Functional.from_config(network.get_config()) # Computes: `(x1+x2) + (x1*x2)` result_tensor = network( @@ -1735,13 +1740,13 @@ class 
DTypeTest(keras_parameterized.TestCase): def test_graph_network_dtype(self): inputs = input_layer_lib.Input((10,)) outputs = layers.Dense(10)(inputs) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) self.assertEqual(network.dtype, 'float32') @testing_utils.enable_v2_dtype_behavior def test_subclassed_network_dtype(self): - class IdentityNetwork(network_lib.Network): + class IdentityNetwork(training_lib.Model): def call(self, inputs): return inputs @@ -1785,11 +1790,11 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): def layer_and_network_test(self): # Top level layer - network = network_lib.Network() + network = functional.Functional() layer_0 = AttrTrackingLayer() - sub_network = network_lib.Network() + sub_network = functional.Functional() layer_1 = AttrTrackingLayer(dynamic=True) layer_2 = AttrTrackingLayer() sub_network.sub_layers = [layer_1, layer_2] @@ -1887,7 +1892,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): x = input_layer_lib.Input(shape=(None, 32)) dense = layers.Dense(2) y = dense(x) - network = network_lib.Network(x, y, name='dense_network') + network = functional.Functional(x, y, name='dense_network') for i in range(999, 1024): self.assertEqual(network.compute_output_shape((1, i, 32)), (1, i, 2)) @@ -1895,7 +1900,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): def test_2d_inputs_squeezed_to_1d(self): input_1d = input_layer_lib.Input(shape=()) outputs = input_1d * 2. - net = network_lib.Network(input_1d, outputs) + net = functional.Functional(input_1d, outputs) x = np.ones((10, 1)) y = net(x) @@ -1904,7 +1909,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): def test_1d_inputs_expanded_to_2d(self): input_1d = input_layer_lib.Input(shape=(1,)) outputs = input_1d * 2. - net = network_lib.Network(input_1d, outputs) + net = functional.Functional(input_1d, outputs) x = np.ones((10,)) y = net(x) @@ -1927,14 +1932,14 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): inputs = input_layer_lib.Input(10) outputs = my_layer(inputs, training=True) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) # Hard-coded value passed during construction is respected. self.assertAllEqual(network(x, training=False), x) inputs = input_layer_lib.Input(10) outputs = my_layer(inputs, training=False) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) network(x, training=True) # Hard-coded value passed during construction is respected. @@ -1942,7 +1947,7 @@ class CacheCorrectnessTest(keras_parameterized.TestCase): inputs = input_layer_lib.Input(10) outputs = my_layer(inputs, training=None) - network = network_lib.Network(inputs, outputs) + network = functional.Functional(inputs, outputs) # `None` value passed during construction is overridden. 
self.assertAllEqual(network(x, training=True), x) diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py index 2d5abac7fd6..d07ed477ba9 100644 --- a/tensorflow/python/keras/engine/sequential.py +++ b/tensorflow/python/keras/engine/sequential.py @@ -26,8 +26,8 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.keras import layers as layer_module from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import functional from tensorflow.python.keras.engine import input_layer -from tensorflow.python.keras.engine import training from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.saving.saved_model import model_serialization from tensorflow.python.keras.utils import generic_utils @@ -35,7 +35,6 @@ from tensorflow.python.keras.utils import layer_utils from tensorflow.python.keras.utils import tf_utils from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training.tracking import base as trackable -from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect from tensorflow.python.util.deprecation import deprecated @@ -48,7 +47,7 @@ SINGLE_LAYER_OUTPUT_ERROR_MSG = ('All layers in a Sequential model should have ' @keras_export('keras.Sequential', 'keras.models.Sequential') -class Sequential(training.Model): +class Sequential(functional.Functional): """`Sequential` groups a linear stack of layers into a `tf.keras.Model`. `Sequential` provides training and inference features on this model. @@ -113,7 +112,9 @@ class Sequential(training.Model): layers: Optional list of layers to add to the model. name: Optional name for the model. """ - super(Sequential, self).__init__(name=name, autocast=False) + # Skip the init in FunctionalModel since model doesn't have input/output yet + super(functional.Functional, self).__init__( # pylint: disable=bad-super-call + name=name, autocast=False) self.supports_masking = True self._compute_output_and_mask_jointly = True self._auto_track_sub_layers = False @@ -152,11 +153,6 @@ class Sequential(training.Model): return layers[1:] return layers[:] - @property - @trackable_layer_utils.cache_recursive_attribute('dynamic') - def dynamic(self): - return any(layer.dynamic for layer in self.layers) - @trackable.no_automatic_dependency_tracking def add(self, layer): """Adds a layer instance on top of the layer stack. @@ -233,7 +229,7 @@ class Sequential(training.Model): self.built = True if set_inputs or self._graph_initialized: - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) self._graph_initialized = True else: self._layers.append(layer) @@ -267,7 +263,7 @@ class Sequential(training.Model): elif self._graph_initialized: self.layers[-1]._outbound_nodes = [] self.outputs = [self.layers[-1].output] - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) self.built = True @trackable.no_automatic_dependency_tracking @@ -341,7 +337,7 @@ class Sequential(training.Model): # case, we fall back to the legacy deferred behavior. # TODO(fchollet): consider raising here, as we should not be # supporting such layers. 
- self._init_graph_network(inputs, outputs, name=self.name) + self._init_graph_network(inputs, outputs) self._graph_initialized = True except: # pylint:disable=bare-except self._use_legacy_deferred_behavior = True @@ -350,7 +346,7 @@ class Sequential(training.Model): @generic_utils.default def build(self, input_shape=None): if self._graph_initialized: - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) else: if input_shape is None: raise ValueError('You must provide an `input_shape` argument.') @@ -380,7 +376,7 @@ class Sequential(training.Model): if self._graph_initialized: if not self.built: - self._init_graph_network(self.inputs, self.outputs, name=self.name) + self._init_graph_network(self.inputs, self.outputs) return super(Sequential, self).call(inputs, training=training, mask=mask) outputs = inputs # handle the corner case where self.layers is empty @@ -519,6 +515,13 @@ class Sequential(training.Model): return False return True + def _assert_weights_created(self): + if self._graph_initialized: + return + # When the graph has not been initialized, use the Model's implementation to + # to check if the weights has been created. + super(functional.Functional, self)._assert_weights_created() # pylint: disable=bad-super-call + def _get_shape_tuple(t): if hasattr(t, 'shape'): diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index bb68ffca2ed..52bf42a099d 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -20,6 +20,9 @@ from __future__ import print_function import copy import itertools +import json +import os +import six from tensorflow.python.autograph.lang import directives from tensorflow.python.distribute import distribute_coordinator as dc @@ -31,19 +34,31 @@ from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import monitoring +from tensorflow.python.framework import errors +from tensorflow.python.framework import errors_impl +from tensorflow.python.framework import func_graph from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras import backend from tensorflow.python.keras import callbacks as callbacks_module from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils +from tensorflow.python.keras.engine import base_layer +from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine import compile_utils from tensorflow.python.keras.engine import data_adapter -from tensorflow.python.keras.engine import network from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.mixed_precision.experimental import loss_scale_optimizer as lso +from tensorflow.python.keras.saving import hdf5_format +from tensorflow.python.keras.saving import save from tensorflow.python.keras.saving.saved_model import model_serialization +from tensorflow.python.keras.utils import generic_utils +from tensorflow.python.keras.utils import layer_utils from tensorflow.python.keras.utils import tf_utils from tensorflow.python.keras.utils import version_utils +from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite +from 
tensorflow.python.keras.utils.io_utils import path_to_string from tensorflow.python.keras.utils.mode_keys import ModeKeys from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -52,12 +67,33 @@ from tensorflow.python.ops import summary_ops_v2 from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_concat_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.profiler import trace +from tensorflow.python.training import checkpoint_management +from tensorflow.python.training import py_checkpoint_reader from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.training.tracking import data_structures +from tensorflow.python.training.tracking import layer_utils as trackable_layer_utils +from tensorflow.python.training.tracking import util as trackable_utils from tensorflow.python.util import deprecation from tensorflow.python.util import nest +from tensorflow.python.util import serialization from tensorflow.python.util import tf_decorator from tensorflow.python.util.tf_export import keras_export +from tensorflow.tools.docs import doc_controls + + +# pylint: disable=g-import-not-at-top +try: + import h5py +except ImportError: + h5py = None + +try: + import yaml +except ImportError: + yaml = None +# pylint: enable=g-import-not-at-top _keras_api_gauge = monitoring.BoolGauge('/tensorflow/api/keras', @@ -97,8 +133,25 @@ def disable_multi_worker(method): target=method, decorator_func=_method_wrapper) +def inject_functional_model_class(cls): + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + from tensorflow.python.keras.engine import training_v1 # pylint: disable=g-import-not-at-top + if cls == Model or cls == training_v1.Model: + return functional.Functional + + cls.__bases__ = tuple(inject_functional_model_class(base) + for base in cls.__bases__) + return cls + + +def is_functional_model_init_params(args, kwargs): + return (len(args) == 2 or + len(args) == 1 and 'outputs' in kwargs or + 'inputs' in kwargs and 'outputs' in kwargs) + + @keras_export('keras.Model', 'keras.models.Model') -class Model(network.Network, version_utils.ModelVersionSelector): +class Model(base_layer.Layer, version_utils.ModelVersionSelector): """`Model` groups layers into an object with training and inference features. Arguments: @@ -174,11 +227,61 @@ class Model(network.Network, version_utils.ModelVersionSelector): _TF_MODULE_IGNORED_PROPERTIES = frozenset( itertools.chain(('_train_counter', '_test_counter', '_predict_counter', '_steps_per_execution'), - network.Network._TF_MODULE_IGNORED_PROPERTIES)) # pylint: disable=protected-access + base_layer.Layer._TF_MODULE_IGNORED_PROPERTIES)) # pylint: disable=protected-access + def __new__(cls, *args, **kwargs): + # Signature detection + if is_functional_model_init_params(args, kwargs) and cls == Model: + # Functional model + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + return functional.Functional(*args, **kwargs) + else: + return super(Model, cls).__new__(cls, *args, **kwargs) + + @trackable.no_automatic_dependency_tracking def __init__(self, *args, **kwargs): - super(Model, self).__init__(*args, **kwargs) - _keras_api_gauge.get_cell('model').set(True) + # Special case for Subclassed Functional Model, which we couldn't detect + # when __new__ is called. 
We only realize it is a functional model when it + # calls super.__init__ with input and output tensor. + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + if (is_functional_model_init_params(args, kwargs) and + not isinstance(self, functional.Functional)): + inject_functional_model_class(self.__class__) + functional.Functional.__init__(self, *args, **kwargs) + return + + # The following are implemented as property functions: + # self.trainable_weights + # self.non_trainable_weights + generic_utils.validate_kwargs(kwargs, {'trainable', 'dtype', 'dynamic', + 'name', 'autocast'}) + super(Model, self).__init__(**kwargs) + # By default, Model is a subclass model, which is not in graph network. + self._is_graph_network = False + + self.inputs = None + self.outputs = None + self.input_names = None + self.output_names = None + # stop_training is used by callback to stop training when error happens + self.stop_training = False + self.history = None + # These objects are used in the default `Model.compile`. They are not + # guaranteed to be set after `Model.compile` is called, as users can + # override compile with custom logic. + self.compiled_loss = None + self.compiled_metrics = None + + # This is True for Sequential networks and Functional networks. + self._compute_output_and_mask_jointly = False + + # Don't reset compilation if already done. This may occur if calling + # `__init__` (or `_init_graph_network`) on an already-compiled model + # such as a Sequential model. Sequential models may need to rebuild + # themselves after compilation. + self._maybe_create_attribute('_is_compiled', False) + self._maybe_create_attribute('optimizer', None) + # Model must be created under scope of DistStrat it will be trained with. if ds_context.has_strategy(): self._distribution_strategy = ds_context.get_strategy() @@ -186,23 +289,20 @@ class Model(network.Network, version_utils.ModelVersionSelector): self._distribution_strategy = None # Defaults to value of `tf.config.experimental_functions_run_eagerly`. self._run_eagerly = None - self.stop_training = False # Initialize cache attrs. self._reset_compile_cache() # Fault-tolerance handler. Set in `ModelCheckpoint`. self._training_state = None - self.history = None - - # These objects are used in the default `Model.compile`. They are not - # guaranteed to be set after `Model.compile` is called, as users can - # override compile with custom logic. - self.compiled_loss = None - self.compiled_metrics = None + self._saved_model_inputs_spec = None + self._trackable_saver = ( + trackable_utils.saver_with_op_caching(self)) self._steps_per_execution = None self._init_batch_counters() + self._base_model_initialized = True + _keras_api_gauge.get_cell('model').set(True) @trackable.no_automatic_dependency_tracking def _init_batch_counters(self): @@ -214,67 +314,153 @@ class Model(network.Network, version_utils.ModelVersionSelector): self._predict_counter = variables.Variable( 0, dtype='int64', aggregation=agg) - def get_weights(self): - """Retrieves the weights of the model. + def __setattr__(self, name, value): + if not getattr(self, '_self_setattr_tracking', True): + super(Model, self).__setattr__(name, value) + return - Returns: - A flat list of Numpy arrays. 
- """ - with self.distribute_strategy.scope(): - return super(Model, self).get_weights() + if all( + isinstance(v, (base_layer.Layer, + data_structures.TrackableDataStructure)) or + trackable_layer_utils.has_weights(v) for v in nest.flatten(value)): + try: + self._base_model_initialized + except AttributeError: + # six.raise_from supresses the original AttributeError from being raised + six.raise_from( + RuntimeError('It looks like you are subclassing `Model` and you ' + 'forgot to call `super(YourClass, self).__init__()`.' + ' Always start with this line.'), None) - def load_weights(self, filepath, by_name=False, skip_mismatch=False): - """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. + super(Model, self).__setattr__(name, value) - If `by_name` is False weights are loaded based on the network's - topology. This means the architecture should be the same as when the weights - were saved. Note that layers that don't have weights are not taken into - account in the topological ordering, so adding or removing layers is fine as - long as they don't have weights. + # Keep track of metric instance created in subclassed model/layer. + # We do this so that we can maintain the correct order of metrics by adding + # the instance to the `metrics` list as soon as it is created. + from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top + if isinstance(value, metrics_module.Metric): + self._metrics.append(value) - If `by_name` is True, weights are loaded into layers only if they share the - same name. This is useful for fine-tuning or transfer-learning models where - some of the layers have changed. + @generic_utils.default + def build(self, input_shape): + """Builds the model based on input shapes received. - Only topological loading (`by_name=False`) is supported when loading weights - from the TensorFlow format. Note that topological loading differs slightly - between TensorFlow and HDF5 formats for user-defined classes inheriting from - `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the - TensorFlow format loads based on the object-local names of attributes to - which layers are assigned in the `Model`'s constructor. + This is to be used for subclassed models, which do not know at instantiation + time what their inputs look like. - Arguments: - filepath: String, path to the weights file to load. For weight files in - TensorFlow format, this is the file prefix (the same as was passed - to `save_weights`). - by_name: Boolean, whether to load weights by name or by topological - order. Only topological loading is supported for weight files in - TensorFlow format. - skip_mismatch: Boolean, whether to skip loading of layers where there is - a mismatch in the number of weights, or a mismatch in the shape of - the weight (only valid when `by_name=True`). + This method only exists for users who want to call `model.build()` in a + standalone way (as a substitute for calling the model on real data to + build it). It will never be called by the framework (and thus it will + never throw unexpected errors in an unrelated workflow). - Returns: - When loading a weight file in TensorFlow format, returns the same status - object as `tf.train.Checkpoint.restore`. When graph building, restore - ops are run automatically as soon as the network is built (on first call - for user-defined classes inheriting from `Model`, immediately if it is - already built). - - When loading weights in HDF5 format, returns `None`. 
+ Args: + input_shape: Single tuple, TensorShape, or list of shapes, where shapes + are tuples, integers, or TensorShapes. Raises: - ImportError: If h5py is not available and the weight file is in HDF5 - format. - ValueError: If `skip_mismatch` is set to `True` when `by_name` is - `False`. + ValueError: + 1. In case of invalid user-provided data (not of type tuple, + list, or TensorShape). + 2. If the model requires call arguments that are agnostic + to the input shapes (positional or kwarg in call signature). + 3. If not all layers were properly built. + 4. If float type inputs are not supported within the layers. + + In each of these cases, the user should build their model by calling it + on real tensor data. """ - if dist_utils.is_tpu_strategy(self._distribution_strategy): - if (self._distribution_strategy.extended.steps_per_run > 1 and - (not network._is_hdf5_filepath(filepath))): # pylint: disable=protected-access - raise ValueError('Load weights is not yet supported with TPUStrategy ' - 'with steps_per_run greater than 1.') - return super(Model, self).load_weights(filepath, by_name, skip_mismatch) + if self._is_graph_network: + super(Model, self).build(input_shape) + return + + if input_shape is None: + raise ValueError('Input shape must be defined when calling build on a ' + 'model subclass network.') + valid_types = (tuple, list, tensor_shape.TensorShape) + if not isinstance(input_shape, valid_types): + raise ValueError('Specified input shape is not one of the valid types. ' + 'Please specify a batch input shape of type tuple or ' + 'list of input shapes. User provided ' + 'input type: {}'.format(type(input_shape))) + + if input_shape and not self.inputs: + # We create placeholders for the `None`s in the shape and build the model + # in a Graph. Since tf.Variable is compatible with both eager execution + # and graph building, the variables created after building the model in + # a Graph are still valid when executing eagerly. + if context.executing_eagerly(): + graph = func_graph.FuncGraph('build_graph') + else: + graph = backend.get_graph() + with graph.as_default(): + if isinstance(input_shape, list): + x = [base_layer_utils.generate_placeholders_from_shape(shape) + for shape in input_shape] + elif isinstance(input_shape, dict): + x = { + k: base_layer_utils.generate_placeholders_from_shape(shape) + for k, shape in input_shape.items() + } + else: + x = base_layer_utils.generate_placeholders_from_shape(input_shape) + + kwargs = {} + call_signature = self._call_full_argspec + call_args = call_signature.args + # Exclude `self`, `inputs`, and any argument with a default value. + if len(call_args) > 2: + if call_signature.defaults: + call_args = call_args[2:-len(call_signature.defaults)] + else: + call_args = call_args[2:] + for arg in call_args: + if arg == 'training': + # Case where `training` is a positional arg with no default. + kwargs['training'] = False + else: + # Has invalid call signature with unknown positional arguments. + raise ValueError( + 'Currently, you cannot build your model if it has ' + 'positional or keyword arguments that are not ' + 'inputs to the model, but are required for its ' + '`call` method. Instead, in order to instantiate ' + 'and build your model, `call` your model on real ' + 'tensor data with all expected call arguments.') + elif len(call_args) < 2: + # Signature without `inputs`. 
+ raise ValueError('You can only call `build` on a model if its `call` ' + 'method accepts an `inputs` argument.') + try: + self.call(x, **kwargs) + except (errors.InvalidArgumentError, TypeError): + raise ValueError('You cannot build your model by calling `build` ' + 'if your layers do not support float type inputs. ' + 'Instead, in order to instantiate and build your ' + 'model, `call` your model on real tensor data (of ' + 'the correct dtype).') + + super(Model, self).build(input_shape) + + def call(self, inputs, training=None, mask=None): + """Calls the model on new inputs. + + In this case `call` just reapplies + all ops in the graph to the new inputs + (e.g. build a new computational graph from the provided inputs). + + Arguments: + inputs: A tensor or list of tensors. + training: Boolean or boolean scalar tensor, indicating whether to run + the `Network` in training mode or inference mode. + mask: A mask or list of masks. A mask can be + either a tensor or None (no mask). + + Returns: + A tensor if there is a single output, or + a list of tensors if there are more than one outputs. + """ + raise NotImplementedError('When subclassing the `Model` class, you should ' + 'implement a `call` method.') def compile(self, optimizer='rmsprop', @@ -399,6 +585,10 @@ class Model(network.Network, version_utils.ModelVersionSelector): dtype='int64', aggregation=variables.VariableAggregationV2.ONLY_FIRST_REPLICA) + @property + def _should_compute_mask(self): + return False + @property def metrics(self): """Returns the model's metrics added using `compile`, `add_metric` APIs. @@ -1661,6 +1851,564 @@ class Model(network.Network, version_utils.ModelVersionSelector): verbose=verbose, callbacks=callbacks) + ###################################################################### + # Functions below are not training related. They are for model weights + # tracking, save/load, serialization, etc. + ###################################################################### + + @property + def trainable_weights(self): + self._assert_weights_created() + return self._dedup_weights( + trackable_layer_utils.gather_trainable_weights( + trainable=self.trainable, + sub_layers=self._layers, + extra_variables=self._trainable_weights)) + + @property + def non_trainable_weights(self): + self._assert_weights_created() + return self._dedup_weights( + trackable_layer_utils.gather_non_trainable_weights( + trainable=self.trainable, + sub_layers=self._layers, + extra_variables=self._non_trainable_weights + + self._trainable_weights)) + + def get_weights(self): + """Retrieves the weights of the model. + + Returns: + A flat list of Numpy arrays. + """ + with self.distribute_strategy.scope(): + return super(Model, self).get_weights() + + def save(self, + filepath, + overwrite=True, + include_optimizer=True, + save_format=None, + signatures=None, + options=None): + """Saves the model to Tensorflow SavedModel or a single HDF5 file. + + The savefile includes: + + - The model architecture, allowing to re-instantiate the model. + - The model weights. + - The state of the optimizer, allowing to resume training + exactly where you left off. + + This allows you to save the entirety of the state of a model + in a single file. + + Saved models can be reinstantiated via `keras.models.load_model`. + The model returned by `load_model` is a compiled model ready to be used + (unless the saved model was never compiled in the first place). + + Models built with the Sequential and Functional API can be saved to both the + HDF5 and SavedModel formats. 
Subclassed models can only be saved with the + SavedModel format. + + Note that the model weights may have different scoped names after being + loaded. Scoped names include the model/layer names, such as + `"dense_1/kernel:0"`. It is recommended that you use the layer properties to + access specific variables, e.g. `model.get_layer("dense_1").kernel`. + + Arguments: + filepath: String, PathLike, path to SavedModel or H5 file to save the + model. + overwrite: Whether to silently overwrite any existing file at the + target location, or provide the user with a manual prompt. + include_optimizer: If True, save optimizer's state together. + save_format: Either `'tf'` or `'h5'`, indicating whether to save the + model to Tensorflow SavedModel or HDF5. Defaults to 'tf' in TF 2.X, + and 'h5' in TF 1.X. + signatures: Signatures to save with the SavedModel. Applicable to the + 'tf' format only. Please see the `signatures` argument in + `tf.saved_model.save` for details. + options: Optional `tf.saved_model.SaveOptions` object that specifies + options for saving to SavedModel. + + Example: + + ```python + from keras.models import load_model + + model.save('my_model.h5') # creates a HDF5 file 'my_model.h5' + del model # deletes the existing model + + # returns a compiled model + # identical to the previous one + model = load_model('my_model.h5') + ``` + """ + save.save_model(self, filepath, overwrite, include_optimizer, save_format, + signatures, options) + + def save_weights(self, filepath, overwrite=True, save_format=None): + """Saves all layer weights. + + Either saves in HDF5 or in TensorFlow format based on the `save_format` + argument. + + When saving in HDF5 format, the weight file has: + - `layer_names` (attribute), a list of strings + (ordered names of model layers). + - For every layer, a `group` named `layer.name` + - For every such layer group, a group attribute `weight_names`, + a list of strings + (ordered names of weights tensor of the layer). + - For every weight in the layer, a dataset + storing the weight value, named after the weight tensor. + + When saving in TensorFlow format, all objects referenced by the network are + saved in the same format as `tf.train.Checkpoint`, including any `Layer` + instances or `Optimizer` instances assigned to object attributes. For + networks constructed from inputs and outputs using `tf.keras.Model(inputs, + outputs)`, `Layer` instances used by the network are tracked/saved + automatically. For user-defined classes which inherit from `tf.keras.Model`, + `Layer` instances must be assigned to object attributes, typically in the + constructor. See the documentation of `tf.train.Checkpoint` and + `tf.keras.Model` for details. + + While the formats are the same, do not mix `save_weights` and + `tf.train.Checkpoint`. Checkpoints saved by `Model.save_weights` should be + loaded using `Model.load_weights`. Checkpoints saved using + `tf.train.Checkpoint.save` should be restored using the corresponding + `tf.train.Checkpoint.restore`. Prefer `tf.train.Checkpoint` over + `save_weights` for training checkpoints. + + The TensorFlow format matches objects and variables by starting at a root + object, `self` for `save_weights`, and greedily matching attribute + names. For `Model.save` this is the `Model`, and for `Checkpoint.save` this + is the `Checkpoint` even if the `Checkpoint` has a model attached. 
This + means saving a `tf.keras.Model` using `save_weights` and loading into a + `tf.train.Checkpoint` with a `Model` attached (or vice versa) will not match + the `Model`'s variables. See the [guide to training + checkpoints](https://www.tensorflow.org/guide/checkpoint) for details + on the TensorFlow format. + + Arguments: + filepath: String or PathLike, path to the file to save the weights to. + When saving in TensorFlow format, this is the prefix used for + checkpoint files (multiple files are generated). Note that the '.h5' + suffix causes weights to be saved in HDF5 format. + overwrite: Whether to silently overwrite any existing file at the + target location, or provide the user with a manual prompt. + save_format: Either 'tf' or 'h5'. A `filepath` ending in '.h5' or + '.keras' will default to HDF5 if `save_format` is `None`. Otherwise + `None` defaults to 'tf'. + + Raises: + ImportError: If h5py is not available when attempting to save in HDF5 + format. + ValueError: For invalid/unknown format arguments. + """ + self._assert_weights_created() + filepath = path_to_string(filepath) + filepath_is_h5 = _is_hdf5_filepath(filepath) + if save_format is None: + if filepath_is_h5: + save_format = 'h5' + else: + save_format = 'tf' + else: + user_format = save_format.lower().strip() + if user_format in ('tensorflow', 'tf'): + save_format = 'tf' + elif user_format in ('hdf5', 'h5', 'keras'): + save_format = 'h5' + else: + raise ValueError( + 'Unknown format "%s". Was expecting one of {"tf", "h5"}.' % ( + save_format,)) + if save_format == 'tf' and filepath_is_h5: + raise ValueError( + ('save_weights got save_format="tf"/"tensorflow", but the ' + 'filepath ("%s") looks like an HDF5 file. Omit the ".h5"/".keras" ' + 'when saving in TensorFlow format.') + % filepath) + + if save_format == 'h5' and h5py is None: + raise ImportError( + '`save_weights` requires h5py when saving in hdf5.') + if save_format == 'tf': + check_filepath = filepath + '.index' + else: + check_filepath = filepath + # If file exists and should not be overwritten: + if not overwrite and os.path.isfile(check_filepath): + proceed = ask_to_proceed_with_overwrite(check_filepath) + if not proceed: + return + if save_format == 'h5': + with h5py.File(filepath, 'w') as f: + hdf5_format.save_weights_to_hdf5_group(f, self.layers) + else: + if context.executing_eagerly(): + session = None + else: + session = backend.get_session() + optimizer = getattr(self, 'optimizer', None) + if (optimizer + and not isinstance(optimizer, trackable.Trackable)): + logging.warning( + ('This model was compiled with a Keras optimizer (%s) but is being ' + 'saved in TensorFlow format with `save_weights`. The model\'s ' + 'weights will be saved, but unlike with TensorFlow optimizers in ' + 'the TensorFlow format the optimizer\'s state will not be ' + 'saved.\n\nConsider using a TensorFlow optimizer from `tf.train`.') + % (optimizer,)) + self._trackable_saver.save(filepath, session=session) + # Record this checkpoint so it's visible from tf.train.latest_checkpoint. + checkpoint_management.update_checkpoint_state_internal( + save_dir=os.path.dirname(filepath), + model_checkpoint_path=filepath, + save_relative_paths=True, + all_model_checkpoint_paths=[filepath]) + + def load_weights(self, filepath, by_name=False, skip_mismatch=False): + """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. + + If `by_name` is False weights are loaded based on the network's + topology. 
This means the architecture should be the same as when the weights + were saved. Note that layers that don't have weights are not taken into + account in the topological ordering, so adding or removing layers is fine as + long as they don't have weights. + + If `by_name` is True, weights are loaded into layers only if they share the + same name. This is useful for fine-tuning or transfer-learning models where + some of the layers have changed. + + Only topological loading (`by_name=False`) is supported when loading weights + from the TensorFlow format. Note that topological loading differs slightly + between TensorFlow and HDF5 formats for user-defined classes inheriting from + `tf.keras.Model`: HDF5 loads based on a flattened list of weights, while the + TensorFlow format loads based on the object-local names of attributes to + which layers are assigned in the `Model`'s constructor. + + Arguments: + filepath: String, path to the weights file to load. For weight files in + TensorFlow format, this is the file prefix (the same as was passed + to `save_weights`). + by_name: Boolean, whether to load weights by name or by topological + order. Only topological loading is supported for weight files in + TensorFlow format. + skip_mismatch: Boolean, whether to skip loading of layers where there is + a mismatch in the number of weights, or a mismatch in the shape of + the weight (only valid when `by_name=True`). + + Returns: + When loading a weight file in TensorFlow format, returns the same status + object as `tf.train.Checkpoint.restore`. When graph building, restore + ops are run automatically as soon as the network is built (on first call + for user-defined classes inheriting from `Model`, immediately if it is + already built). + + When loading weights in HDF5 format, returns `None`. + + Raises: + ImportError: If h5py is not available and the weight file is in HDF5 + format. + ValueError: If `skip_mismatch` is set to `True` when `by_name` is + `False`. + """ + if dist_utils.is_tpu_strategy(self._distribution_strategy): + if (self._distribution_strategy.extended.steps_per_run > 1 and + (not _is_hdf5_filepath(filepath))): + raise ValueError('Load weights is not yet supported with TPUStrategy ' + 'with steps_per_run greater than 1.') + if skip_mismatch and not by_name: + raise ValueError( + 'When calling model.load_weights, skip_mismatch can only be set to ' + 'True when by_name is True.') + + filepath = path_to_string(filepath) + if _is_hdf5_filepath(filepath): + save_format = 'h5' + else: + try: + py_checkpoint_reader.NewCheckpointReader(filepath) + save_format = 'tf' + except errors_impl.DataLossError: + # The checkpoint is not readable in TensorFlow format. Try HDF5. + save_format = 'h5' + if save_format == 'tf': + status = self._trackable_saver.restore(filepath) + if by_name: + raise NotImplementedError( + 'Weights may only be loaded based on topology into Models when ' + 'loading TensorFlow-formatted weights (got by_name=True to ' + 'load_weights).') + if not context.executing_eagerly(): + session = backend.get_session() + # Restore existing variables (if any) immediately, and set up a + # streaming restore for any variables created in the future. 
+ trackable_utils.streaming_restore(status=status, session=session) + status.assert_nontrivial_match() + return status + if h5py is None: + raise ImportError( + '`load_weights` requires h5py when loading weights from HDF5.') + if not self._is_graph_network and not self.built: + raise ValueError( + 'Unable to load weights saved in HDF5 format into a subclassed ' + 'Model which has not created its variables yet. Call the Model ' + 'first, then load the weights.') + self._assert_weights_created() + with h5py.File(filepath, 'r') as f: + if 'layer_names' not in f.attrs and 'model_weights' in f: + f = f['model_weights'] + if by_name: + hdf5_format.load_weights_from_hdf5_group_by_name( + f, self.layers, skip_mismatch=skip_mismatch) + else: + hdf5_format.load_weights_from_hdf5_group(f, self.layers) + + def _updated_config(self): + """Util shared between different serialization methods. + + Returns: + Model config with Keras version information added. + """ + from tensorflow.python.keras import __version__ as keras_version # pylint: disable=g-import-not-at-top + + config = self.get_config() + model_config = { + 'class_name': self.__class__.__name__, + 'config': config, + 'keras_version': keras_version, + 'backend': backend.backend() + } + return model_config + + def get_config(self): + raise NotImplementedError + + @classmethod + def from_config(cls, config, custom_objects=None): + # Since only FunctionalModel produces config, the model can only + # be constructed for FunctionalModel + from tensorflow.python.keras.engine import functional # pylint: disable=g-import-not-at-top + return functional.Functional.from_config( + config, custom_objects=custom_objects) + + def to_json(self, **kwargs): + """Returns a JSON string containing the network configuration. + + To load a network from a JSON save file, use + `keras.models.model_from_json(json_string, custom_objects={})`. + + Arguments: + **kwargs: Additional keyword arguments + to be passed to `json.dumps()`. + + Returns: + A JSON string. + """ + model_config = self._updated_config() + return json.dumps( + model_config, default=serialization.get_json_type, **kwargs) + + def to_yaml(self, **kwargs): + """Returns a yaml string containing the network configuration. + + To load a network from a yaml save file, use + `keras.models.model_from_yaml(yaml_string, custom_objects={})`. + + `custom_objects` should be a dictionary mapping + the names of custom losses / layers / etc to the corresponding + functions / classes. + + Arguments: + **kwargs: Additional keyword arguments + to be passed to `yaml.dump()`. + + Returns: + A YAML string. + + Raises: + ImportError: if yaml module is not found. + """ + if yaml is None: + raise ImportError( + 'Requires yaml module installed (`pip install pyyaml`).') + return yaml.dump(self._updated_config(), **kwargs) + + def reset_states(self): + for layer in self.layers: + if hasattr(layer, 'reset_states') and getattr(layer, 'stateful', False): + layer.reset_states() + + @property + @deprecation.deprecated( + date=None, + instructions='This property should not be used in TensorFlow 2.0, ' + 'as updates are applied automatically.') + @doc_controls.do_not_generate_docs + def state_updates(self): + """Deprecated, do NOT use! + + Returns the `updates` from all layers that are stateful. + + This is useful for separating training updates and + state updates, e.g. when we need to update a layer's internal state + during prediction. + + Returns: + A list of update ops. 
+ """ + state_updates = [] + for layer in self.layers: + if getattr(layer, 'stateful', False): + if hasattr(layer, 'updates'): + state_updates += layer.updates + return state_updates + + @property + def weights(self): + """Returns the list of all layer variables/weights. + + Returns: + A list of variables. + """ + return self._dedup_weights(self._undeduplicated_weights) + + @property + def _undeduplicated_weights(self): + """Returns the undeduplicated list of all layer variables/weights.""" + self._assert_weights_created() + weights = [] + for layer in self._layers: + weights += layer.weights + weights += (self._trainable_weights + self._non_trainable_weights) + return weights + + def summary(self, line_length=None, positions=None, print_fn=None): + """Prints a string summary of the network. + + Arguments: + line_length: Total length of printed lines + (e.g. set this to adapt the display to different + terminal window sizes). + positions: Relative or absolute positions of log elements + in each line. If not provided, + defaults to `[.33, .55, .67, 1.]`. + print_fn: Print function to use. Defaults to `print`. + It will be called on each line of the summary. + You can set it to a custom function + in order to capture the string summary. + + Raises: + ValueError: if `summary()` is called before the model is built. + """ + if not self.built: + raise ValueError('This model has not yet been built. ' + 'Build the model first by calling `build()` or calling ' + '`fit()` with some data, or specify ' + 'an `input_shape` argument in the first layer(s) for ' + 'automatic build.') + layer_utils.print_summary(self, + line_length=line_length, + positions=positions, + print_fn=print_fn) + + @property + def layers(self): + return self._unique_sublayers() + + def get_layer(self, name=None, index=None): + """Retrieves a layer based on either its name (unique) or index. + + If `name` and `index` are both provided, `index` will take precedence. + Indices are based on order of horizontal graph traversal (bottom-up). + + Arguments: + name: String, name of layer. + index: Integer, index of layer. + + Returns: + A layer instance. + + Raises: + ValueError: In case of invalid layer name or index. + """ + # TODO(fchollet): We could build a dictionary based on layer names + # since they are constant, but we have not done that yet. + if index is not None and name is not None: + raise ValueError('Provide only a layer name or a layer index.') + + if index is not None: + if len(self.layers) <= index: + raise ValueError('Was asked to retrieve layer at index ' + str(index) + + ' but model only has ' + str(len(self.layers)) + + ' layers.') + else: + return self.layers[index] + + if name is not None: + for layer in self.layers: + if layer.name == name: + return layer + raise ValueError('No such layer: ' + name + '.') + raise ValueError('Provide either a layer name or layer index.') + + @trackable.no_automatic_dependency_tracking + def _set_save_spec(self, inputs): + if self._saved_model_inputs_spec is not None: + return # Already set. 
+ + input_names = self.input_names + if not input_names: + input_names = compile_utils.create_pseudo_input_names(inputs) + + flat_inputs = nest.flatten(inputs) + specs = [] + for name, tensor in zip(input_names, flat_inputs): + specs.append( + tf_utils.get_tensor_spec(tensor, dynamic_batch=False, name=name)) + specs = nest.pack_sequence_as(inputs, specs) + + self._saved_model_inputs_spec = specs + + def _get_save_spec(self, dynamic_batch=True): + if self._saved_model_inputs_spec is None: + return None + + return nest.map_structure( + lambda t: tf_utils.get_tensor_spec(t, dynamic_batch=dynamic_batch), + self._saved_model_inputs_spec) + + def _assert_weights_created(self): + """Asserts that all the weights for the model have been created. + + For a non-dynamic model, the weights must already be created after the + layer has been called. For a dynamic model, the exact list of weights can + never be known for certain since it may change at any time during execution. + + We run this check right before accessing weights or getting the Numpy value + for the current weights. Otherwise, if the layer has never been called, + the user would just get an empty list, which is misleading. + + Raises: + ValueError: if the weights of the network has not yet been created. + """ + if self.dynamic: + return + + if ('build' in self.__class__.__dict__ and + self.__class__ != Model and + not self.built): + # For any model that has customized build() method but hasn't + # been invoked yet, this will cover both sequential and subclass model. + # Also make sure to exclude Model class itself which has build() defined. + raise ValueError('Weights for model %s have not yet been created. ' + 'Weights are created when the Model is first called on ' + 'inputs or `build()` is called with an `input_shape`.' % + self.name) + def _check_call_args(self, method_name): """Check that `call` has only one positional arg.""" # Always allow first arg, regardless of arg name. @@ -1990,3 +2738,8 @@ def _disallow_inside_tf_function(method_name): 'directly on `Tensor`s inside a `tf.function` like: `model(x)`.' ).format(method_name=method_name) raise RuntimeError(error_msg) + + +def _is_hdf5_filepath(filepath): + return (filepath.endswith('.h5') or filepath.endswith('.keras') or + filepath.endswith('.hdf5')) diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index 16188af833a..c137c6e517a 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -43,7 +43,7 @@ from tensorflow.python.keras import losses from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils -from tensorflow.python.keras.engine import network +from tensorflow.python.keras.engine import base_layer from tensorflow.python.keras.engine import training as training_lib from tensorflow.python.keras.engine import training_arrays from tensorflow.python.keras.engine import training_distributed @@ -181,8 +181,8 @@ class Model(training_lib.Model): self._compile_time_distribution_strategy) if strategy: with strategy.scope(): - return network.Network.get_weights(self) - return network.Network.get_weights(self) + return base_layer.Layer.get_weights(self) + return base_layer.Layer.get_weights(self) def load_weights(self, filepath, by_name=False, skip_mismatch=False): """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. 
@@ -232,7 +232,7 @@ class Model(training_lib.Model): """ if distributed_training_utils.is_tpu_strategy(self._distribution_strategy): if (self._distribution_strategy.extended.steps_per_run > 1 and - (not network._is_hdf5_filepath(filepath))): # pylint: disable=protected-access + (not training_lib._is_hdf5_filepath(filepath))): # pylint: disable=protected-access raise ValueError('Load weights is not yet supported with TPUStrategy ' 'with steps_per_run greater than 1.') return super(Model, self).load_weights(filepath, by_name, skip_mismatch) @@ -491,6 +491,11 @@ class Model(training_lib.Model): """Returns the model's metrics added using `compile`, `add_metric` APIs.""" metrics = [] if self._is_compiled: + if not hasattr(self, '_v1_compile_was_called'): + # See b/155687393 for more details, the model is created as a v2 + # instance but converted to v1. Fallback to use base Model to retrieve + # the metrics. + return super(Model, self).metrics metrics += self._compile_metric_functions metrics.extend(self._metrics) metrics.extend(_get_metrics_from_layers(self._layers)) @@ -504,6 +509,12 @@ class Model(training_lib.Model): # losses for backward compatibility. metrics_names = ['loss'] if self._is_compiled: + if not hasattr(self, '_v1_compile_was_called'): + # See b/155687393 for more details, the model is created as a v2 + # instance but converted to v1. Fallback to use base Model to retrieve + # the metrics name + return super(Model, self).metrics_names + # Add output loss metric names to the metric names list. if len(self._training_endpoints) > 1: metrics_names.extend([ diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index fc7feda07a5..30be3d485df 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -114,7 +114,7 @@ def populate_deserializable_objects(): LOCAL.ALL_OBJECTS['Input'] = input_layer.Input LOCAL.ALL_OBJECTS['InputSpec'] = input_spec.InputSpec - LOCAL.ALL_OBJECTS['Network'] = models.Network + LOCAL.ALL_OBJECTS['Functional'] = models.Functional LOCAL.ALL_OBJECTS['Model'] = models.Model LOCAL.ALL_OBJECTS['SequenceFeatures'] = SequenceFeatures LOCAL.ALL_OBJECTS['Sequential'] = models.Sequential diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py index a3173f4d11f..bb22db25591 100644 --- a/tensorflow/python/keras/layers/wrappers_test.py +++ b/tensorflow/python/keras/layers/wrappers_test.py @@ -377,7 +377,8 @@ class TimeDistributedTest(keras_parameterized.TestCase): input_layer.compute_output_shape([None, 2, 4]).as_list(), [None, 2, 8]) - @keras_parameterized.run_all_keras_modes + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) + # TODO(scottzhu): check why v1 session failed. 
def test_TimeDistributed_with_mask_first_implementation(self): np.random.seed(100) rnn_layer = keras.layers.LSTM(4, return_sequences=True, stateful=True) diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index eaffb90e64b..9f5099e100e 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -23,7 +23,7 @@ from tensorflow.python.framework import ops from tensorflow.python.keras import backend as K from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import optimizers -from tensorflow.python.keras.engine import network +from tensorflow.python.keras.engine import functional from tensorflow.python.keras.engine import sequential from tensorflow.python.keras.engine import training from tensorflow.python.keras.engine import training_v1 @@ -31,7 +31,6 @@ from tensorflow.python.keras.engine.base_layer import AddMetric from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_layer import Input from tensorflow.python.keras.engine.input_layer import InputLayer -from tensorflow.python.keras.engine.network import Network from tensorflow.python.keras.saving import model_config from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils import generic_utils @@ -45,6 +44,7 @@ from tensorflow.python.util.tf_export import keras_export # API entries importable from `keras.models`: Model = training.Model # pylint: disable=invalid-name Sequential = sequential.Sequential # pylint: disable=invalid-name +Functional = functional.Functional # pylint: disable=invalid-name save_model = save.save_model load_model = save.load_model model_from_config = model_config.model_from_config @@ -193,12 +193,12 @@ def _clone_functional_model(model, input_tensors=None, layer_fn=_clone_layer): if not callable(layer_fn): raise ValueError('Expected `layer_fn` argument to be a callable.') - model_config, created_layers = _clone_layers_and_model_config( + model_configs, created_layers = _clone_layers_and_model_config( model, new_input_layers, layer_fn) # Reconstruct model from the config, using the cloned layers. input_tensors, output_tensors, created_layers = ( - network.reconstruct_from_config(model_config, - created_layers=created_layers)) + functional.reconstruct_from_config(model_configs, + created_layers=created_layers)) metrics_names = model.metrics_names model = Model(input_tensors, output_tensors, name=model.name) # Layers not directly tied to outputs of the Model, such as loss layers @@ -209,8 +209,8 @@ def _clone_functional_model(model, input_tensors=None, layer_fn=_clone_layer): if ancillary_layers: new_nodes = nest.flatten([ layer.inbound_nodes[1:] - if network._should_skip_first_node(layer) else layer.inbound_nodes - for layer in created_layers.values() + if functional._should_skip_first_node(layer) + else layer.inbound_nodes for layer in created_layers.values() ]) _insert_ancillary_layers(model, ancillary_layers, metrics_names, new_nodes) return model @@ -244,7 +244,8 @@ def _clone_layers_and_model_config(model, input_layers, layer_fn): created_layers[layer.name] = layer_fn(layer) return {} - config = network.get_network_config(model, serialize_layer_fn=_copy_layer) + config = functional.get_network_config( + model, serialize_layer_fn=_copy_layer) return config, created_layers @@ -495,7 +496,7 @@ def _in_place_subclassed_model_reset(model): # This will not work for nested subclassed models used as layers. 
# This would be theoretically possible to support, but would add complexity. # Only do it if users complain. - if isinstance(layer, Network) and not layer._is_graph_network: + if isinstance(layer, training.Model) and not layer._is_graph_network: raise ValueError('We do not support the use of nested subclassed models ' 'in `model_to_estimator` at this time. Found nested ' 'model: %s' % layer) diff --git a/tensorflow/python/keras/saving/hdf5_format_test.py b/tensorflow/python/keras/saving/hdf5_format_test.py index cae58329005..757385a25ea 100644 --- a/tensorflow/python/keras/saving/hdf5_format_test.py +++ b/tensorflow/python/keras/saving/hdf5_format_test.py @@ -1210,7 +1210,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): def test_incompatible_checkpoint(self): save_path = trackable.Checkpoint().save( os.path.join(self.get_temp_dir(), 'ckpt')) - m = keras.Model() + m = DummySubclassModel() with self.assertRaisesRegexp(AssertionError, 'Nothing to load'): m.load_weights(save_path) m.dense = keras.layers.Dense(2) @@ -1222,7 +1222,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_directory_passed(self): with self.cached_session(): - m = keras.Model() + m = DummySubclassModel() v = m.add_weight(name='v', shape=[]) self.evaluate(v.assign(42.)) prefix = os.path.join(self.get_temp_dir(), @@ -1235,7 +1235,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_relative_path(self): with self.cached_session(): - m = keras.Model() + m = DummySubclassModel() v = m.add_weight(name='v', shape=[]) os.chdir(self.get_temp_dir()) @@ -1266,7 +1266,7 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): @combinations.generate(combinations.combine(mode=['graph', 'eager'])) def test_nonexistent_prefix_directory(self): with self.cached_session(): - m = keras.Model() + m = DummySubclassModel() v = m.add_weight(name='v', shape=[]) self.evaluate(v.assign(42.)) prefix = os.path.join(self.get_temp_dir(), @@ -1276,5 +1276,10 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase, parameterized.TestCase): m.load_weights(prefix) self.assertEqual(42., self.evaluate(v)) + +class DummySubclassModel(training.Model): + pass + + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/saving/saved_model/load.py b/tensorflow/python/keras/saving/saved_model/load.py index 5ffeb0671a1..13af49e3a0d 100644 --- a/tensorflow/python/keras/saving/saved_model/load.py +++ b/tensorflow/python/keras/saving/saved_model/load.py @@ -62,9 +62,9 @@ layers_module = LazyLoader( input_layer = LazyLoader( "input_layer", globals(), "tensorflow.python.keras.engine.input_layer") -network_lib = LazyLoader( - "network_lib", globals(), - "tensorflow.python.keras.engine.network") +functional_lib = LazyLoader( + "functional_lib", globals(), + "tensorflow.python.keras.engine.functional") training_lib = LazyLoader( "training_lib", globals(), "tensorflow.python.keras.engine.training") @@ -142,7 +142,7 @@ def _is_graph_network(layer): # pylint: disable=protected-access if isinstance(layer, RevivedNetwork): return False - elif isinstance(layer, network_lib.Network): + elif isinstance(layer, functional_lib.Functional): return (layer._is_graph_network or isinstance(layer, models_lib.Sequential)) return False @@ -371,7 +371,8 @@ class 
KerasObjectLoader(tf_load.Loader): # functional or Sequential model. model_is_functional_or_sequential = ( metadata.get('is_graph_network', False) or - metadata['class_name'] == 'Sequential') + metadata['class_name'] == 'Sequential' or + metadata['class_name'] == 'Functional') if not (generic_utils.validate_config(config) and model_is_functional_or_sequential): return None # Revive as custom model. @@ -383,7 +384,8 @@ class KerasObjectLoader(tf_load.Loader): if class_name == 'Sequential': model = models_lib.Sequential(name=config['name']) else: - model = models_lib.Model(name=config['name']) + model = models_lib.Functional( + inputs=[], outputs=[], name=config['name']) # Record this model and its layers. This will later be used to reconstruct # the model. @@ -561,10 +563,11 @@ class KerasObjectLoader(tf_load.Loader): if not model.built and not isinstance(input_specs, dict): model.build(input_shapes) else: - (inputs, outputs, created_layers) = network_lib.reconstruct_from_config( - config, created_layers={layer.name: layer for layer in layers}) + (inputs, outputs, + created_layers) = functional_lib.reconstruct_from_config( + config, created_layers={layer.name: layer for layer in layers}) model.__init__(inputs, outputs, name=config['name']) - network_lib.connect_ancillary_layers(model, created_layers) + functional_lib.connect_ancillary_layers(model, created_layers) # Set model dtype and trainable status. _set_network_attributes_from_metadata(model) @@ -764,7 +767,7 @@ def revive_custom_object(identifier, metadata): revived_classes = { '_tf_keras_layer': (RevivedLayer, base_layer.Layer), '_tf_keras_input_layer': (RevivedInputLayer, input_layer.InputLayer), - '_tf_keras_network': (RevivedNetwork, network_lib.Network), + '_tf_keras_network': (RevivedNetwork, functional_lib.Functional), '_tf_keras_model': (RevivedNetwork, model_class), '_tf_keras_sequential': (RevivedNetwork, models_lib.Sequential), } @@ -852,7 +855,7 @@ def _revive_setter(layer, name, value): layer._track_trackable(value, name=name) layer._serialized_attributes[name] = value # pylint: enable=protected-access - elif (isinstance(layer, network_lib.Network) and + elif (isinstance(layer, functional_lib.Functional) and re.match(r'^layer(_with_weights)?-[\d+]', name) is not None): # Edges named "layer-n" or "layer_with_weights-n", which are tracked in # network._track_layers, should not be added as an attribute. 
diff --git a/tensorflow/python/keras/saving/saved_model/model_serialization.py b/tensorflow/python/keras/saving/saved_model/model_serialization.py index 412fb0b54e5..c711e82a045 100644 --- a/tensorflow/python/keras/saving/saved_model/model_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/model_serialization.py @@ -20,11 +20,11 @@ from __future__ import print_function from tensorflow.python.keras.saving import saving_utils from tensorflow.python.keras.saving.saved_model import constants -from tensorflow.python.keras.saving.saved_model import network_serialization +from tensorflow.python.keras.saving.saved_model import layer_serialization from tensorflow.python.keras.saving.saved_model import save_impl -class ModelSavedModelSaver(network_serialization.NetworkSavedModelSaver): +class ModelSavedModelSaver(layer_serialization.LayerSavedModelSaver): """Model SavedModel serialization.""" @property @@ -33,6 +33,10 @@ class ModelSavedModelSaver(network_serialization.NetworkSavedModelSaver): def _python_properties_internal(self): metadata = super(ModelSavedModelSaver, self)._python_properties_internal() + # Network stateful property is dependent on the child layers. + metadata.pop('stateful') + metadata['is_graph_network'] = self.obj._is_graph_network # pylint: disable=protected-access + metadata.update( saving_utils.model_metadata( self.obj, include_optimizer=True, require_config=False)) diff --git a/tensorflow/python/keras/saving/saved_model/network_serialization.py b/tensorflow/python/keras/saving/saved_model/network_serialization.py index 1c94377e3db..c98cba47155 100644 --- a/tensorflow/python/keras/saving/saved_model/network_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/network_serialization.py @@ -18,22 +18,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.keras.saving.saved_model import layer_serialization +from tensorflow.python.keras.saving.saved_model import model_serialization -# Network serialization is pretty much the same as layer serialization. -class NetworkSavedModelSaver(layer_serialization.LayerSavedModelSaver): +# FunctionalModel serialization is pretty much the same as Model serialization. +class NetworkSavedModelSaver(model_serialization.ModelSavedModelSaver): """Network serialization.""" @property def object_identifier(self): return '_tf_keras_network' - - def _python_properties_internal(self): - metadata = super(NetworkSavedModelSaver, self)._python_properties_internal() - - # Network stateful property is dependent on the child layers. 
- metadata.pop('stateful') - - metadata['is_graph_network'] = self.obj._is_graph_network # pylint: disable=protected-access - return metadata diff --git a/tensorflow/python/keras/utils/version_utils_test.py b/tensorflow/python/keras/utils/version_utils_test.py index 76e888ca553..0a3cd53f3c0 100644 --- a/tensorflow/python/keras/utils/version_utils_test.py +++ b/tensorflow/python/keras/utils/version_utils_test.py @@ -53,12 +53,12 @@ class SplitUtilsTest(keras_parameterized.TestCase): inputs = keras.Input(10) outputs = keras.layers.Dense(1)(inputs) model = keras.Model(inputs, outputs) - self._check_model_class(model.__class__) + self._check_model_class(model.__class__.__bases__[0]) self._check_layer_class(model) def test_sequential_model(self): model = keras.Sequential([keras.layers.Dense(1)]) - model_class = model.__class__.__bases__[0] + model_class = model.__class__.__bases__[0].__bases__[0] self._check_model_class(model_class) self._check_layer_class(model) diff --git a/tensorflow/python/keras/utils/vis_utils.py b/tensorflow/python/keras/utils/vis_utils.py index 87c436a5bd7..158f6c83748 100644 --- a/tensorflow/python/keras/utils/vis_utils.py +++ b/tensorflow/python/keras/utils/vis_utils.py @@ -55,10 +55,10 @@ def check_pydot(): def is_wrapped_model(layer): - from tensorflow.python.keras.engine import network + from tensorflow.python.keras.engine import functional from tensorflow.python.keras.layers import wrappers return (isinstance(layer, wrappers.Wrapper) and - isinstance(layer.layer, network.Network)) + isinstance(layer.layer, functional.Functional)) def add_edge(dot, src, dst): @@ -98,7 +98,7 @@ def model_to_dot(model, """ from tensorflow.python.keras.layers import wrappers from tensorflow.python.keras.engine import sequential - from tensorflow.python.keras.engine import network + from tensorflow.python.keras.engine import functional if not check_pydot(): message = ( @@ -147,7 +147,8 @@ def model_to_dot(model, class_name = layer.__class__.__name__ if isinstance(layer, wrappers.Wrapper): - if expand_nested and isinstance(layer.layer, network.Network): + if expand_nested and isinstance(layer.layer, + functional.Functional): submodel_wrapper = model_to_dot(layer.layer, show_shapes, show_layer_names, rankdir, expand_nested, @@ -162,7 +163,7 @@ def model_to_dot(model, child_class_name = layer.layer.__class__.__name__ class_name = '{}({})'.format(class_name, child_class_name) - if expand_nested and isinstance(layer, network.Network): + if expand_nested and isinstance(layer, functional.Functional): submodel_not_wrapper = model_to_dot(layer, show_shapes, show_layer_names, rankdir, expand_nested, @@ -200,7 +201,8 @@ def model_to_dot(model, inputlabels, outputlabels) - if not expand_nested or not isinstance(layer, network.Network): + if not expand_nested or not isinstance( + layer, functional.Functional): node = pydot.Node(layer_id, label=label) dot.add_node(node) @@ -218,16 +220,17 @@ def model_to_dot(model, add_edge(dot, inbound_layer_id, layer_id) else: # if inbound_layer is not Model or wrapped Model - if (not isinstance(inbound_layer, network.Network) and + if (not isinstance(inbound_layer, + functional.Functional) and not is_wrapped_model(inbound_layer)): # if current layer is not Model or wrapped Model - if (not isinstance(layer, network.Network) and + if (not isinstance(layer, functional.Functional) and not is_wrapped_model(layer)): assert dot.get_node(inbound_layer_id) assert dot.get_node(layer_id) add_edge(dot, inbound_layer_id, layer_id) # if current layer is Model - elif 
isinstance(layer, network.Network): + elif isinstance(layer, functional.Functional): add_edge(dot, inbound_layer_id, sub_n_first_node[layer.name].get_name()) # if current layer is wrapped Model @@ -236,9 +239,9 @@ def model_to_dot(model, name = sub_w_first_node[layer.layer.name].get_name() add_edge(dot, layer_id, name) # if inbound_layer is Model - elif isinstance(inbound_layer, network.Network): + elif isinstance(inbound_layer, functional.Functional): name = sub_n_last_node[inbound_layer.name].get_name() - if isinstance(layer, network.Network): + if isinstance(layer, functional.Functional): output_name = sub_n_first_node[layer.name].get_name() add_edge(dot, name, output_name) else: diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index 272396239d7..d696021fcb4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 8979491971f..b8486a27b9e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index 448ea60cc0f..7bf71844fa6 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.LinearModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 8e1d9927434..87a7319639b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.WideDeepModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], 
varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 13c3416fc0c..00c9fc22def 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.models.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 9218cbea99e..d3cca7311ee 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.models.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index 272396239d7..d696021fcb4 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 8979491971f..b8486a27b9e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index 448ea60cc0f..7bf71844fa6 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.LinearModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: 
"args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 8e1d9927434..87a7319639b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -2,7 +2,6 @@ path: "tensorflow.keras.experimental.WideDeepModel" tf_class { is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -176,7 +175,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 13c3416fc0c..00c9fc22def 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -1,7 +1,6 @@ path: "tensorflow.keras.models.Model" tf_class { is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" @@ -175,7 +174,7 @@ tf_class { } member_method { name: "compute_mask" - argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "compute_output_shape" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 9218cbea99e..d3cca7311ee 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -1,8 +1,8 @@ path: "tensorflow.keras.models.Sequential" tf_class { is_instance: "" + is_instance: "" is_instance: "" - is_instance: "" is_instance: "" is_instance: "" is_instance: "" From 9da04eec59c1e99d7b4c0bfd29f9efa09598cf68 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Tue, 12 May 2020 17:23:01 -0700 Subject: [PATCH 076/412] IWYU in profiler/lib PiperOrigin-RevId: 311233906 Change-Id: I9207e56c017112eba2f59ee57d67c9825a015818 --- tensorflow/core/profiler/lib/BUILD | 3 +++ tensorflow/core/profiler/lib/annotated_traceme.h | 4 ++++ tensorflow/core/profiler/lib/profiler_session.cc | 11 ++++++++++- tensorflow/core/profiler/lib/profiler_session.h | 2 ++ tensorflow/core/profiler/lib/scoped_annotation.h | 1 + tensorflow/core/profiler/lib/traceme.h | 4 ++++ 6 files changed, 24 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index b3028c717bf..6316fd118fc 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -52,12 +52,14 @@ cc_library( "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "//tensorflow/core/profiler:profiler_options_proto_cc", "@com_google_absl//absl/memory", + "//tensorflow/core:protos_all_cc", ] + if_not_android([ ":profiler_utils", "//tensorflow/core/profiler/internal:profiler_factory", 
"//tensorflow/core/profiler/utils:derived_timeline", "//tensorflow/core/profiler/utils:group_events", "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_schema", ]), alwayslink = True, ) @@ -109,6 +111,7 @@ cc_library( ":traceme", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", ], ) diff --git a/tensorflow/core/profiler/lib/annotated_traceme.h b/tensorflow/core/profiler/lib/annotated_traceme.h index f40c1e9ad92..c3257e2adbe 100644 --- a/tensorflow/core/profiler/lib/annotated_traceme.h +++ b/tensorflow/core/profiler/lib/annotated_traceme.h @@ -15,7 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_ #define TENSORFLOW_CORE_PROFILER_LIB_ANNOTATED_TRACEME_H_ +#include + #include "absl/strings/string_view.h" +#include "absl/types/optional.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/lib/scoped_annotation.h" diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index 497ee76b2af..9783cd14f95 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -15,12 +15,20 @@ limitations under the License. #include "tensorflow/core/profiler/lib/profiler_session.h" +#include + #include "absl/memory/memory.h" #include "tensorflow/core/platform/env_time.h" -#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/platform.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/profiler_options.pb.h" +#include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/util/env_var.h" #if !defined(IS_MOBILE_PLATFORM) @@ -28,6 +36,7 @@ limitations under the License. #include "tensorflow/core/profiler/lib/profiler_utils.h" #include "tensorflow/core/profiler/utils/derived_timeline.h" #include "tensorflow/core/profiler/utils/group_events.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" #endif diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h index 83d0683f740..6f92b047eb7 100644 --- a/tensorflow/core/profiler/lib/profiler_session.h +++ b/tensorflow/core/profiler/lib/profiler_session.h @@ -21,9 +21,11 @@ limitations under the License. 
#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/thread_annotations.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/profiler_options.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/lib/scoped_annotation.h b/tensorflow/core/profiler/lib/scoped_annotation.h index 61b0cf42dd6..2cad5fd4708 100644 --- a/tensorflow/core/profiler/lib/scoped_annotation.h +++ b/tensorflow/core/profiler/lib/scoped_annotation.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include "absl/strings/string_view.h" #include "tensorflow/core/platform/macros.h" diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index 8b42f187850..af93ac11b1e 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -15,7 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ #define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ +#include +#include + #include "absl/strings/match.h" +#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" #include "absl/strings/strip.h" #include "tensorflow/core/platform/env_time.h" From 6d583589fe3f1fd95290df760abe165526c18585 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 18:01:00 -0700 Subject: [PATCH 077/412] Flush denormals to zero in eager mode. PiperOrigin-RevId: 311239051 Change-Id: Iefbc09c82e07af29580319fee024965a2c554378 --- .../common_runtime/eager/kernel_and_device.cc | 4 +++ .../python/kernel_tests/denormal_test.py | 33 +++++++++---------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 3c586e8188a..bf7c083f24b 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -35,8 +35,10 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/setround.h" #include "tensorflow/core/profiler/lib/annotated_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/version.h" @@ -281,6 +283,8 @@ Status KernelAndDeviceOp::Run( OpKernelContext context(¶ms); { + port::ScopedFlushDenormal flush; + port::ScopedSetRound round(FE_TONEAREST); // 'AnnotatedTraceMe' will trace both scheduling time on host and execution // time on device of the OpKernel. 
profiler::AnnotatedTraceMe activity( diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py index d824e95f213..6e073f0d526 100644 --- a/tensorflow/python/kernel_tests/denormal_test.py +++ b/tensorflow/python/kernel_tests/denormal_test.py @@ -23,7 +23,6 @@ import platform from tensorflow.python.framework import constant_op from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -35,32 +34,30 @@ class DenormalTest(test.TestCase): tiny = np.finfo(dtype).tiny self.assertEqual(tiny, tiny / 16 * 16) - def _flushDenormalsTest(self, use_gpu, dtypes): - if platform.machine() == "ppc64le" or platform.machine( - ) == "s390x" or platform.machine() == "aarch64": + def _flushDenormalsTest(self, dtypes): + if (platform.machine() == "ppc64le" or platform.machine() == "s390x" or + platform.machine() == "aarch64"): # Disabled denormal_test on power/s390x/aarch64 platform # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902 return - with self.cached_session(use_gpu=use_gpu): - array_ops.identity(7).eval() - for dtype in dtypes: - tiny = np.finfo(dtype).tiny - # Small shape to test main thread, large shape to test thread pool - for shape in (), (1 << 20,): - flush = 0.1 * constant_op.constant(tiny, shape=shape) - self.assertAllEqual(flush.eval(), np.zeros(shape)) - # Make sure the flags don't leak out - self.testPythonHasDenormals() + for dtype in dtypes: + tiny = np.finfo(dtype).tiny + # Small shape to test main thread, large shape to test thread pool + for shape in (), (1 << 20,): + flush = 0.1 * constant_op.constant(tiny, shape=shape) + self.assertAllEqual(self.evaluate(flush), np.zeros(shape)) + # Make sure the flags don't leak out + self.testPythonHasDenormals() - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=False) def testFlushDenormalsCPU(self): # On CPUs, the processor flags flush for both single and double precision. - self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64)) + self._flushDenormalsTest(dtypes=(np.float32, np.float64)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testFlushDenormalsGPU(self): # On GPUs, only single precision can flush to zero. - self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,)) + self._flushDenormalsTest(dtypes=(np.float32,)) if __name__ == "__main__": From 4e9b6b454e1d057513ac477b2cd65f5925f91cc8 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 01:29:03 +0000 Subject: [PATCH 078/412] Fix the issue of tf.divide's return value is not a tensor This PR fixes the issue of 39475 where tf.divide's return value is not a tensor in case x, y in divide(x, y) are both primitive python types. The reason was that tf.divide relies on implict `x / y`. However, if both x and y are not tensor, the return value will fall through python and will not be a tensor. This PR fixes the issue. This PR fixes 39475. 
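For illustration, a minimal eager-mode sketch of the behavior this change is meant to guarantee (it mirrors the unit test added in this patch and is not itself part of the change):

    import tensorflow as tf

    # Both operands are plain Python ints; after this fix the result is
    # still a tf.Tensor instead of falling through to a Python float.
    x = tf.divide(5, 2)
    print(isinstance(x, tf.Tensor))  # expected: True
    print(float(x))                  # expected: 2.5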
Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 7 ++++++- tensorflow/python/ops/math_ops_test.py | 6 ++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index f062047cec2..b981af72e83 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -72,6 +72,7 @@ from __future__ import print_function import numpy as np import six +import sys from six.moves import builtins from six.moves import xrange # pylint: disable=redefined-builtin @@ -438,9 +439,13 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: + if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): + if sys.version_info.major < 3: + return _truediv_python2(x, y) + else: + return _truediv_python3(x, y) return x / y - @tf_export("math.multiply", "multiply") @dispatch.add_dispatch_support def multiply(x, y, name=None): diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 2405eec9e49..dab0ea88ba8 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -495,6 +495,12 @@ class DivAndModTest(test_util.TensorFlowTestCase): # Consistent with desire to get numerator self.assertAllEqual(tf_result, expanded_nums) + def testWithPythonValue(self): + # Test case for GitHub issue 39475: + # https://github.com/tensorflow/tensorflow/issues/39475 + x = math_ops.divide(5, 2) + self.assertTrue(isinstance(x, ops.Tensor)) + @test_util.run_all_in_graph_and_eager_modes class DivNoNanTest(test_util.TensorFlowTestCase): From e617aabd09291f752dc0c56f337dcd5031bd754f Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Tue, 12 May 2020 18:25:28 -0700 Subject: [PATCH 079/412] Add optional error reporting and status returns to MicroOpResolver PiperOrigin-RevId: 311242225 Change-Id: Ibb92991c3ab161c1aac5d828f8f4f3e17cdecd8b --- .../examples/micro_speech/main_functions.cc | 24 ++-- .../lite/micro/micro_mutable_op_resolver.h | 55 ++++++--- .../micro/micro_mutable_op_resolver_test.cc | 105 +++++++++++++++++- 3 files changed, 157 insertions(+), 27 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc index 23c63a32986..d3989c07333 100644 --- a/tensorflow/lite/micro/examples/micro_speech/main_functions.cc +++ b/tensorflow/lite/micro/examples/micro_speech/main_functions.cc @@ -74,14 +74,22 @@ void setup() { // // tflite::ops::micro::AllOpsResolver resolver; // NOLINTNEXTLINE(runtime-global-variables) - static tflite::MicroOpResolver<3> micro_op_resolver; - micro_op_resolver.AddBuiltin( - tflite::BuiltinOperator_DEPTHWISE_CONV_2D, - tflite::ops::micro::Register_DEPTHWISE_CONV_2D()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_FULLY_CONNECTED, - tflite::ops::micro::Register_FULLY_CONNECTED()); - micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, - tflite::ops::micro::Register_SOFTMAX()); + static tflite::MicroOpResolver<3> micro_op_resolver(error_reporter); + if (micro_op_resolver.AddBuiltin( + tflite::BuiltinOperator_DEPTHWISE_CONV_2D, + tflite::ops::micro::Register_DEPTHWISE_CONV_2D()) != kTfLiteOk) { + return; + } + if (micro_op_resolver.AddBuiltin( + tflite::BuiltinOperator_FULLY_CONNECTED, + tflite::ops::micro::Register_FULLY_CONNECTED()) != kTfLiteOk) { + return; + } + if 
(micro_op_resolver.AddBuiltin(tflite::BuiltinOperator_SOFTMAX, + tflite::ops::micro::Register_SOFTMAX()) != + kTfLiteOk) { + return; + } // Build an interpreter to run the model with. static tflite::MicroInterpreter static_interpreter( diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index 2f6d4d27823..ead9be490a3 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -34,6 +34,9 @@ inline int MicroOpResolverAnyVersion() { return 0; } template class MicroOpResolver : public OpResolver { public: + explicit MicroOpResolver(ErrorReporter* error_reporter = nullptr) + : error_reporter_(error_reporter) {} + const TfLiteRegistration* FindOp(tflite::BuiltinOperator op, int version) const override { for (unsigned int i = 0; i < registrations_len_; ++i) { @@ -62,11 +65,16 @@ class MicroOpResolver : public OpResolver { return nullptr; } - void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration, - int version = 1) { + TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, + TfLiteRegistration* registration, int version = 1) { if (registrations_len_ >= tOpCount) { - // TODO(b/147748244) - Add error reporting hooks so we can report this! - return; + if (error_reporter_) { + TF_LITE_REPORT_ERROR(error_reporter_, + "Couldn't register builtin op #%d, resolver size " + "is too small (%d)", + op, tOpCount); + } + return kTfLiteError; } TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; registrations_len_ += 1; @@ -74,20 +82,32 @@ class MicroOpResolver : public OpResolver { *new_registration = *registration; new_registration->builtin_code = op; new_registration->version = version; + + return kTfLiteOk; } - void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration, - int min_version, int max_version) { + TfLiteStatus AddBuiltin(tflite::BuiltinOperator op, + TfLiteRegistration* registration, int min_version, + int max_version) { for (int version = min_version; version <= max_version; ++version) { - AddBuiltin(op, registration, version); + TfLiteStatus add_status = AddBuiltin(op, registration, version); + if (add_status != kTfLiteOk) { + return add_status; + } } + return kTfLiteOk; } - void AddCustom(const char* name, TfLiteRegistration* registration, - int version = 1) { + TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration, + int version = 1) { if (registrations_len_ >= tOpCount) { - // TODO(b/147748244) - Add error reporting hooks so we can report this! 
- return; + if (error_reporter_) { + TF_LITE_REPORT_ERROR( + error_reporter_, + "Couldn't register custom op '%s', resolver size is too small (%d)", + name, tOpCount); + } + return kTfLiteError; } TfLiteRegistration* new_registration = ®istrations_[registrations_len_]; registrations_len_ += 1; @@ -96,13 +116,19 @@ class MicroOpResolver : public OpResolver { new_registration->builtin_code = BuiltinOperator_CUSTOM; new_registration->custom_name = name; new_registration->version = version; + + return kTfLiteOk; } - void AddCustom(const char* name, TfLiteRegistration* registration, - int min_version, int max_version) { + TfLiteStatus AddCustom(const char* name, TfLiteRegistration* registration, + int min_version, int max_version) { for (int version = min_version; version <= max_version; ++version) { - AddCustom(name, registration, version); + TfLiteStatus add_status = AddCustom(name, registration, version); + if (add_status != kTfLiteOk) { + return add_status; + } } + return kTfLiteOk; } unsigned int GetRegistrationLength() { return registrations_len_; } @@ -110,6 +136,7 @@ class MicroOpResolver : public OpResolver { private: TfLiteRegistration registrations_[tOpCount]; unsigned int registrations_len_ = 0; + ErrorReporter* error_reporter_; TF_LITE_REMOVE_VIRTUAL_DELETE }; diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc index cf39994acec..61ab0e3bec9 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc +++ b/tensorflow/lite/micro/micro_mutable_op_resolver_test.cc @@ -14,7 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/micro/micro_mutable_op_resolver.h" - #include "tensorflow/lite/micro/testing/micro_test.h" namespace tflite { @@ -35,6 +34,22 @@ TfLiteStatus MockPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus MockInvoke(TfLiteContext* context, TfLiteNode* node) { return kTfLiteOk; } + +class MockErrorReporter : public ErrorReporter { + public: + MockErrorReporter() : has_been_called_(false) {} + int Report(const char* format, va_list args) override { + has_been_called_ = true; + return 0; + }; + + bool HasBeenCalled() { return has_been_called_; } + + private: + bool has_been_called_; + TF_LITE_REMOVE_VIRTUAL_DELETE +}; + } // namespace } // namespace tflite @@ -52,8 +67,10 @@ TF_LITE_MICRO_TEST(TestOperations) { // We need space for 7 operators because of 2 ops, one with 3 versions, one // with 4 versions. MicroOpResolver<7> micro_op_resolver; - micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r, 1, 3); - micro_op_resolver.AddCustom("mock_custom", &r, 1, 4); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 1, 3)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + micro_op_resolver.AddCustom("mock_custom", &r, 1, 4)); OpResolver* resolver = µ_op_resolver; const TfLiteRegistration* registration = @@ -96,8 +113,10 @@ TF_LITE_MICRO_TEST(TestOpRegistrationOverflow) { MicroOpResolver<4> micro_op_resolver; // Register 7 ops, but only 4 is expected because the class is created with // that limit.. 
- micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r, 0, 2); - micro_op_resolver.AddCustom("mock_custom", &r, 0, 3); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 0, 2)); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, + micro_op_resolver.AddCustom("mock_custom", &r, 0, 3)); OpResolver* resolver = µ_op_resolver; TF_LITE_MICRO_EXPECT_EQ(4, micro_op_resolver.GetRegistrationLength()); @@ -174,4 +193,80 @@ TF_LITE_MICRO_TEST(TestZeroModelVersion) { TF_LITE_MICRO_EXPECT_EQ(nullptr, registration); } +TF_LITE_MICRO_TEST(TestBuiltinRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<1> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, micro_op_resolver.AddBuiltin(BuiltinOperator_CONV_2D, &r)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, micro_op_resolver.AddBuiltin(BuiltinOperator_RELU, &r)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + +TF_LITE_MICRO_TEST(TestCustomRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<1> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, + micro_op_resolver.AddCustom("mock_custom_0", &r)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, + micro_op_resolver.AddCustom("mock_custom_1", &r)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + +TF_LITE_MICRO_TEST(TestBuiltinVersionRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<2> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, micro_op_resolver.AddBuiltin( + BuiltinOperator_CONV_2D, &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ(kTfLiteError, micro_op_resolver.AddBuiltin( + BuiltinOperator_RELU, &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + +TF_LITE_MICRO_TEST(TestCustomVersionRegistrationErrorReporting) { + using tflite::BuiltinOperator_CONV_2D; + using tflite::BuiltinOperator_RELU; + using tflite::MicroOpResolver; + + static TfLiteRegistration r = {tflite::MockInit, tflite::MockFree, + tflite::MockPrepare, tflite::MockInvoke}; + + tflite::MockErrorReporter mock_reporter; + MicroOpResolver<2> micro_op_resolver(&mock_reporter); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, micro_op_resolver.AddCustom("mock_custom_0", &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(false, mock_reporter.HasBeenCalled()); + 
TF_LITE_MICRO_EXPECT_EQ( + kTfLiteError, micro_op_resolver.AddCustom("mock_custom_1", &r, 1, 2)); + TF_LITE_MICRO_EXPECT_EQ(true, mock_reporter.HasBeenCalled()); +} + TF_LITE_MICRO_TESTS_END From 047d788ea42397f22aa3f6d80c9c9dce53f564b5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 18:34:46 -0700 Subject: [PATCH 080/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311243401 Change-Id: I89adb2b883527f2e548665b7710bfb03d71b32cd --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From f690a054c599d51d7c8e9ae83c7d0ebd70f80cca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 19:45:42 -0700 Subject: [PATCH 081/412] Flush denormals to zero in eager mode. PiperOrigin-RevId: 311251058 Change-Id: I6ddca2fabc904e8e7400735aaddef361ba0b8778 --- .../common_runtime/eager/kernel_and_device.cc | 4 --- .../python/kernel_tests/denormal_test.py | 33 ++++++++++--------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index bf7c083f24b..3c586e8188a 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -35,10 +35,8 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/random/random.h" -#include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/fingerprint.h" -#include "tensorflow/core/platform/setround.h" #include "tensorflow/core/profiler/lib/annotated_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/version.h" @@ -283,8 +281,6 @@ Status KernelAndDeviceOp::Run( OpKernelContext context(¶ms); { - port::ScopedFlushDenormal flush; - port::ScopedSetRound round(FE_TONEAREST); // 'AnnotatedTraceMe' will trace both scheduling time on host and execution // time on device of the OpKernel. 
profiler::AnnotatedTraceMe activity( diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py index 6e073f0d526..d824e95f213 100644 --- a/tensorflow/python/kernel_tests/denormal_test.py +++ b/tensorflow/python/kernel_tests/denormal_test.py @@ -23,6 +23,7 @@ import platform from tensorflow.python.framework import constant_op from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -34,30 +35,32 @@ class DenormalTest(test.TestCase): tiny = np.finfo(dtype).tiny self.assertEqual(tiny, tiny / 16 * 16) - def _flushDenormalsTest(self, dtypes): - if (platform.machine() == "ppc64le" or platform.machine() == "s390x" or - platform.machine() == "aarch64"): + def _flushDenormalsTest(self, use_gpu, dtypes): + if platform.machine() == "ppc64le" or platform.machine( + ) == "s390x" or platform.machine() == "aarch64": # Disabled denormal_test on power/s390x/aarch64 platform # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902 return - for dtype in dtypes: - tiny = np.finfo(dtype).tiny - # Small shape to test main thread, large shape to test thread pool - for shape in (), (1 << 20,): - flush = 0.1 * constant_op.constant(tiny, shape=shape) - self.assertAllEqual(self.evaluate(flush), np.zeros(shape)) - # Make sure the flags don't leak out - self.testPythonHasDenormals() + with self.cached_session(use_gpu=use_gpu): + array_ops.identity(7).eval() + for dtype in dtypes: + tiny = np.finfo(dtype).tiny + # Small shape to test main thread, large shape to test thread pool + for shape in (), (1 << 20,): + flush = 0.1 * constant_op.constant(tiny, shape=shape) + self.assertAllEqual(flush.eval(), np.zeros(shape)) + # Make sure the flags don't leak out + self.testPythonHasDenormals() - @test_util.run_in_graph_and_eager_modes(use_gpu=False) + @test_util.run_deprecated_v1 def testFlushDenormalsCPU(self): # On CPUs, the processor flags flush for both single and double precision. - self._flushDenormalsTest(dtypes=(np.float32, np.float64)) + self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64)) - @test_util.run_in_graph_and_eager_modes(use_gpu=True) + @test_util.run_deprecated_v1 def testFlushDenormalsGPU(self): # On GPUs, only single precision can flush to zero. 
- self._flushDenormalsTest(dtypes=(np.float32,)) + self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,)) if __name__ == "__main__": From e0157b592c2c3b0f75226bc4aaf0c5bc1df69974 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 10:45:58 +0800 Subject: [PATCH 082/412] add fp16 option to NNAPI delegate adding a field for relaxing fp32 to fp16 into the NNAPI delegate option and changing evaluation tools accordingly --- .../lite/delegates/nnapi/nnapi_delegate.cc | 16 ++++++++++++---- tensorflow/lite/delegates/nnapi/nnapi_delegate.h | 5 +++++ .../lite/delegates/nnapi/nnapi_delegate_kernel.h | 1 + .../accuracy/ilsvrc/imagenet_model_evaluator.cc | 4 ++-- .../accuracy/ilsvrc/imagenet_model_evaluator.h | 2 +- .../tools/delegates/nnapi_delegate_provider.cc | 15 ++++++++++++++- .../evaluation/evaluation_delegate_provider.cc | 4 ++-- .../evaluation/proto/evaluation_stages.proto | 4 ++-- .../evaluation/stages/tflite_inference_stage.cc | 1 - .../tasks/coco_object_detection/run_eval.cc | 6 +++--- .../imagenet_image_classification/run_eval.cc | 6 +++--- .../evaluation/tasks/inference_diff/run_eval.cc | 5 +++-- 12 files changed, 48 insertions(+), 21 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 002c29915c6..867d03f5227 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -3151,7 +3151,8 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, "creating NNAPI model", nnapi_errno); nn_model_.reset(model); - TF_LITE_ENSURE_STATUS(BuildGraph(context, params->input_tensors, + TF_LITE_ENSURE_STATUS(BuildGraph(context, params->delegate, + params->input_tensors, params->output_tensors, nnapi_errno)); } @@ -3202,6 +3203,7 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, const auto delegate_options = StatefulNnApiDelegate::GetOptions(node->delegate); + ANeuralNetworksCompilation* compilation = nullptr; if (!nnapi_devices_.empty()) { // Compile for the selected accelerator. @@ -3875,8 +3877,9 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, } TfLiteStatus NNAPIDelegateKernel::BuildGraph( - TfLiteContext* context, const TfLiteIntArray* input_tensors, - const TfLiteIntArray* output_tensors, int* nnapi_errno) { + TfLiteContext* context, TfLiteDelegate* delegate, + const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, + int* nnapi_errno) { // Build the ops and tensors. TF_LITE_ENSURE_STATUS(AddOpsAndTensors(context, nnapi_errno)); // Map input and output tensor indices to ANN @@ -3885,6 +3888,7 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( std::vector outputs; outputs.reserve(output_tensors->size); + const auto delegate_options = StatefulNnApiDelegate::GetOptions(delegate); size_t total_input_byte_size = 0; // Make the TensorFlow Lite inputs and outputs to ann_indices. 
for (int i : TfLiteIntArrayView(input_tensors)) { @@ -3941,11 +3945,13 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( outputs.data()), "identifying model inputs and outputs", nnapi_errno); + auto allow_fp16 = + context->allow_fp32_relax_to_fp16 | delegate_options.allow_fp16; if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI11) { RETURN_TFLITE_ERROR_IF_NN_ERROR( context, nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16( - nn_model_.get(), context->allow_fp32_relax_to_fp16), + nn_model_.get(), allow_fp16), "set relaxed computation mode for fp32 if possible", nnapi_errno); } @@ -4021,6 +4027,7 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, options.max_number_delegated_partitions; TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, "Created TensorFlow Lite delegate for NNAPI."); + delegate_data_.allow_fp16 = options.allow_fp16; Prepare = DoPrepare; CopyFromBufferHandle = DoCopyFromBufferHandle; CopyToBufferHandle = DoCopyToBufferHandle; @@ -4048,6 +4055,7 @@ const StatefulNnApiDelegate::Options StatefulNnApiDelegate::GetOptions( options.disallow_nnapi_cpu = delegate_data->disallow_nnapi_cpu; options.max_number_delegated_partitions = delegate_data->max_number_delegated_partitions; + options.allow_fp16 = delegate_data->allow_fp16; return options; } diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index fe777ea99aa..1bd9fb5c49f 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -89,6 +89,9 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // The selection is currently done sorting partitions in decreasing order // of number of nodes and selecting them until the limit is reached. int max_number_delegated_partitions = 3; + + // allow fp32 compuation to be run in fp16 + bool allow_fp16 = false; }; // Uses default options. @@ -184,6 +187,8 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // Maximum number of NNAPI partition to delegate. Zero or negative means // no limit. Copied from StatefulNnApiDelegate::Options int max_number_delegated_partitions; + // allow fp32 computation to be run in fp32 + bool allow_fp16 = false; ~Data(); diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index b35bf0224fd..60151196372 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -349,6 +349,7 @@ class NNAPIDelegateKernel { TfLiteStatus AddOpsAndTensors(TfLiteContext* context, int* nnapi_errno); TfLiteStatus BuildGraph(TfLiteContext* context, + TfLiteDelegate* delegate, const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index 61c2acb8b2e..64ce87ae8aa 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -141,8 +141,8 @@ class CompositeObserver : public ImagenetModelEvaluator::Observer { tflite::Flag::CreateFlag(kNumRanksFlag, ¶ms.num_ranks, "Generates the top-1 to top-k accuracy values" "where k = num_ranks. 
Default: 10"), - tflite::Flag::CreateFlag("allow_fp16", ¶ms.allow_fp16, - "allow fp16"), + tflite::Flag::CreateFlag("nnapi_allow_fp16", ¶ms.nnapi_allow_fp16, + "allow fp16 in nnapi"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h index 323069383c3..3ba22cbc2af 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h @@ -80,7 +80,7 @@ class ImagenetModelEvaluator { int num_interpreter_threads = 1; // allow fp16 - bool allow_fp16 = false; + bool nnapi_allow_fp16 = false; }; // An evaluation observer. diff --git a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc index f3ed8743e54..6492ba82849 100644 --- a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc @@ -33,6 +33,8 @@ class NnapiDelegateProvider : public DelegateProvider { ToolParam::Create("")); default_params_.AddParam("disable_nnapi_cpu", ToolParam::Create(false)); + default_params_.AddParam("nnapi_allow_fp16", + ToolParam::Create(false)); } std::vector CreateFlags(ToolParams* params) const final; @@ -56,7 +58,9 @@ std::vector NnapiDelegateProvider::CreateFlags(ToolParams* params) const { "nnapi_accelerator_name", params, "the name of the nnapi accelerator to use (requires Android Q+)"), CreateFlag("disable_nnapi_cpu", params, - "Disable the NNAPI CPU device")}; + "Disable the NNAPI CPU device"), + CreateFlag("nnapi_allow_fp16", params, + "Allow fp32 computation to be run in fp16")}; return flags; } @@ -83,6 +87,10 @@ void NnapiDelegateProvider::LogParams(const ToolParams& params) const { TFLITE_LOG(INFO) << "disable_nnapi_cpu: [" << params.Get("disable_nnapi_cpu") << "]"; } + if (params.Get("nnapi_allow_fp16")) { + TFLITE_LOG(INFO) << "nnapi_allow_fp16: [" + << params.Get("nnapi_allow_fp16") << "]"; + } } #endif } @@ -99,6 +107,11 @@ TfLiteDelegatePtr NnapiDelegateProvider::CreateTfLiteDelegate( } else if (params.Get("disable_nnapi_cpu")) { options.disallow_nnapi_cpu = true; } + + if (params.Get("nnapi_allow_fp16")) { + options.allow_fp16 = true; + } + std::string string_execution_preference = params.Get("nnapi_execution_preference"); // Only set execution preference if user explicitly passes one. 
Otherwise, diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc index a7625441406..ea07378a8fa 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc @@ -132,8 +132,8 @@ tools::ToolParams DelegateProviders::GetAllParams( tool_params.Set("num_threads", params.num_threads()); } - if (params.has_allow_fp16()) { - tool_params.Set("allow_fp16", params.allow_fp16()); + if (params.has_nnapi_allow_fp16()) { + tool_params.Set("nnapi_allow_fp16", params.nnapi_allow_fp16()); } const auto type = params.delegate(); diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto index c7d033eb111..cecdb22c637 100644 --- a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto +++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto @@ -122,8 +122,8 @@ message TfliteInferenceParams { // required for every input. optional int32 invocations_per_run = 4 [default = 1]; - // allow_fp16 - optional bool allow_fp16 = 5 [default = false]; + // nnapi_allow_fp16 + optional bool nnapi_allow_fp16 = 5 [default = false]; } // Metrics specific to TFLite inference. diff --git a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc index 8189140e953..365a00c3cd1 100644 --- a/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc +++ b/tensorflow/lite/tools/evaluation/stages/tflite_inference_stage.cc @@ -95,7 +95,6 @@ TfLiteStatus TfliteInferenceStage::Init( return kTfLiteError; } interpreter_->SetNumThreads(params.num_threads()); - interpreter_->SetAllowFp16PrecisionForFp32(params.allow_fp16()); if (!delegate_providers) { std::string error_message; diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index 1ff4e55c270..de1ae6e2e94 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -106,8 +106,8 @@ CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'xnnpack', 'hexagon'}"), tflite::Flag::CreateFlag( - "allow_fp16", &allow_fp16_, - "allow fp16"), + "nnapi_allow_fp16", &allow_fp16_, + "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); DelegateProviders delegate_providers; @@ -136,7 +136,7 @@ absl::optional CocoObjectDetection::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_allow_fp16(allow_fp16_); + inference_params->set_nnapi_allow_fp16(allow_fp16_); // Get ground truth data. 
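Downstream of the proto rename, each evaluation task forwards its --nnapi_allow_fp16 flag into TfliteInferenceParams. A compact sketch of that wiring (setter names follow the run_eval.cc hunks in this patch; the NNAPI enum value on the delegate field is an assumption):

    #include <string>
    #include "tensorflow/lite/tools/evaluation/proto/evaluation_stages.pb.h"

    void ConfigureNnapiFp16(tflite::evaluation::TfliteInferenceParams* params,
                            const std::string& model_path, int num_threads) {
      params->set_model_file_path(model_path);
      params->set_num_threads(num_threads);
      params->set_delegate(tflite::evaluation::TfliteInferenceParams::NNAPI);
      params->set_nnapi_allow_fp16(true);  // renamed from allow_fp16 by this patch
    }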
absl::flat_hash_map ground_truth_map; diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc index 1e1cf86732a..8a7fd864c6e 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc @@ -108,8 +108,8 @@ ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), tflite::Flag::CreateFlag( - "allow_fp16", &allow_fp16_, - "allow fp16"), + "nnapi_allow_fp16", &allow_fp16_, + "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -159,7 +159,7 @@ absl::optional ImagenetClassification::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_allow_fp16(allow_fp16_); + inference_params->set_nnapi_allow_fp16(allow_fp16_); classification_params->mutable_topk_accuracy_eval_params()->set_k(10); ImageClassificationStage eval(eval_config); diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc index de41fb96a03..c85d997974b 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc @@ -72,7 +72,8 @@ InferenceDiff::InferenceDiff(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for test inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), - tflite::Flag::CreateFlag("allow_fp16", &allow_fp16_, "allow fp16") + tflite::Flag::CreateFlag("nnapi_allow_fp16", &allow_fp16_, + "nnapi allow fp16") }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -90,7 +91,7 @@ absl::optional InferenceDiff::Run() { // generating random data. 
inference_params->set_invocations_per_run(3); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_allow_fp16(allow_fp16_); + inference_params->set_nnapi_allow_fp16(allow_fp16_); if (!delegate_.empty() && inference_params->delegate() == TfliteInferenceParams::NONE) { TFLITE_LOG(WARN) << "Unsupported TFLite delegate: " << delegate_; From efa921a7702b6afee571da9de52aea801c519968 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 11:03:16 +0800 Subject: [PATCH 083/412] add `nnapi_allow_fp16` option to benchmark_model --- .../lite/tools/benchmark/benchmark_performance_options.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc index cafef6fa133..c2d9374506e 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc @@ -233,6 +233,7 @@ void BenchmarkPerformanceOptions::ResetPerformanceOptions() { single_option_run_params_->Set("nnapi_accelerator_name", ""); single_option_run_params_->Set("disable_nnapi_cpu", false); single_option_run_params_->Set("max_delegated_partitions", 0); + single_option_run_params_->Set("nnapi_allow_fp16", false); #endif #if defined(TFLITE_ENABLE_HEXAGON) single_option_run_params_->Set("use_hexagon", false); @@ -302,6 +303,9 @@ void BenchmarkPerformanceOptions::CreatePerformanceOptions() { BenchmarkParam::Create(false)); params.AddParam("max_delegated_partitions", BenchmarkParam::Create(0)); + params.AddParam("max_delegated_partitions", + params.AddParam("nnapi_allow_fp16", + BenchmarkParam::Create(false)); all_run_params_.emplace_back(std::move(params)); } } From 1c74b32aa27dc0d40a9ce1f883ea632d399a7b9a Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Tue, 12 May 2020 21:23:08 -0700 Subject: [PATCH 084/412] Validate remote resource devices before safe access of resources. Cluster updates (due to recreated distribution strategies, remote worker failures, etc.) can lead to crashing failures with segfaults when accessing resources created before the update. Some common patterns are: * Accessing datasets created on old remote workers; * Accessing variables created on failed workers; * Garbage collecting datasets/iterators created on old remote workers; This CL validate the remote devices to make sure the access is safe before executing the ops by looking up the device in a set of device pointers and checking its incarnation ID. Remote workers on restarted devices will have different incarnation IDs, and accessing resources on those devices will fail gracefully. 
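The check described above boils down to looking up the resource's cached device incarnation in the current remote device manager; a restarted worker registers a freshly generated incarnation ID, so stale handles no longer match and the op fails with an error instead of dereferencing a dead device. A sketch of that check, mirroring the ValidateTensorHandleRemoteDevice helper this change adds to MaybeUpdateOpDevice in eager/execute.cc (see the hunks below):

    // Sketch only; the patch performs this validation via the new
    // DeviceMgr::ContainsDevice() before executing ops with resource inputs.
    Status ValidateRemoteResource(const DeviceMgr* remote_device_mgr,
                                  int64 device_incarnation) {
      if (remote_device_mgr->ContainsDevice(device_incarnation)) {
        return Status::OK();
      }
      return errors::InvalidArgument(
          "Resource input tensor contains an invalid device. This might happen "
          "when the client has connected to a different cluster, or some remote "
          "workers have been restarted.");
    }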
PiperOrigin-RevId: 311261000 Change-Id: Ifc07862229b06301e0275fe80975565d9df28152 --- tensorflow/c/eager/c_api_cluster_test.cc | 120 ++++++++++++++++++ tensorflow/c/eager/c_api_test.cc | 2 + tensorflow/c/eager/c_api_test_util.cc | 1 + tensorflow/core/common_runtime/device_mgr.cc | 5 + tensorflow/core/common_runtime/device_mgr.h | 10 ++ .../core/common_runtime/dynamic_device_mgr.cc | 7 + .../core/common_runtime/eager/execute.cc | 17 +++ .../common_runtime/eager/tensor_handle.cc | 20 +++ .../core/common_runtime/eager/tensor_handle.h | 6 + .../eager/tensor_handle_test.cc | 101 ++++++++++++++- 10 files changed, 286 insertions(+), 3 deletions(-) diff --git a/tensorflow/c/eager/c_api_cluster_test.cc b/tensorflow/c/eager/c_api_cluster_test.cc index 8f585d6f02c..252a0408758 100644 --- a/tensorflow/c/eager/c_api_cluster_test.cc +++ b/tensorflow/c/eager/c_api_cluster_test.cc @@ -50,6 +50,13 @@ tensorflow::ServerDef GetServerDef(int num_tasks) { return GetServerDef("localhost", num_tasks); } +void ReplaceTaskInServerDef(tensorflow::ServerDef* server_def, int task_index) { + tensorflow::JobDef* job_def = server_def->mutable_cluster()->mutable_job(0); + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->at(task_index) = + tensorflow::strings::StrCat("localhost:", port); +} + void CheckTFE_TensorHandleHasFloats(TFE_TensorHandle* handle, const std::vector& expected_values) { std::unique_ptr status( @@ -101,6 +108,22 @@ void CheckRemoteMatMulExecutesOK(TFE_Context* ctx, TF_DeleteStatus(status); } +// Read the value of variable `var` and save it into `out_value`. +void ReadVariable(TFE_Context* ctx, TFE_TensorHandle* var, + TFE_TensorHandle** out_value) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "ReadVariableOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrType(op, "dtype", TF_FLOAT); + TFE_OpAddInput(op, var, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 1; + TFE_Execute(op, out_value, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + TF_DeleteStatus(status); +} + void TestRemoteExecuteChangeServerDef(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); @@ -243,6 +266,102 @@ TEST(CAPI, RemoteExecuteUpdateServerDefAsync) { TestRemoteExecuteUpdateServerDef(true); } +void TestRemoteExecuteUpdateServerDefResourceAccess(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + // This server def has the task index set to 0. 
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetAsync(opts, static_cast(async)); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + const char dev0_name[] = "/job:localhost/replica:0/task:0/device:CPU:0"; + const char dev1_name[] = "/job:localhost/replica:0/task:1/device:CPU:0"; + + TFE_TensorHandle* var_handle0 = TestVariable(ctx, 1.0, dev0_name); + EXPECT_NE(var_handle0, nullptr); + TFE_TensorHandle* var_handle1 = TestVariable(ctx, 2.0, dev1_name); + EXPECT_NE(var_handle1, nullptr); + + TFE_TensorHandle* value_handle = nullptr; + ReadVariable(ctx, var_handle1, &value_handle); + CheckTFE_TensorHandleHasFloats(value_handle, {2}); + TFE_DeleteTensorHandle(value_handle); + + // Start a new worker to replace task:1 + ReplaceTaskInServerDef(&server_def, 1); + server_def.set_task_index(1); + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server.release(); + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + // Update server def to replace the remote device with the device info on the + // new worker (different incarnation ID). + server_def.set_task_index(0); + string serialized_update = server_def.SerializeAsString(); + TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(), + serialized_update.size(), status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + // The device of var_handle0 is local device which is the same before and + // after cluster update. Remove resource with valid device should succeed. + TFE_Op* op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle0, status); + TFE_OpSetDevice(op, dev0_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + int num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + // The device of var_handle1 is remote device, which was replaced during + // cluster update. Removing resource with invalid device should fail + // gracefully (i.e., with error status) instead of crashing with segfaults. + op = TFE_NewOp(ctx, "DestroyResourceOp", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, var_handle1, status); + TFE_OpSetDevice(op, dev1_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + num_retvals = 0; + TFE_Execute(op, nullptr, &num_retvals, status); + EXPECT_NE(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteOp(op); + + TFE_DeleteTensorHandle(var_handle0); + TFE_DeleteTensorHandle(var_handle1); + + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. 
+ worker_server.release(); +} + +TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccess) { + TestRemoteExecuteUpdateServerDefResourceAccess(false); +} + +TEST(CAPI, TestRemoteExecuteUpdateServerDefResourceAccessAsync) { + TestRemoteExecuteUpdateServerDefResourceAccess(true); +} + void TestRemoteExecuteUpdateServerDefWithFailures(bool async) { // Fail fast on GetStatus requests so we can get errors instead of timeout // when updating cluster with non-exsitent worker @@ -282,6 +401,7 @@ void TestRemoteExecuteUpdateServerDefWithFailures(bool async) { int port = tensorflow::testing::PickUnusedPortOrDie(); job_def->mutable_tasks()->insert( {2, tensorflow::strings::StrCat("localhost:", port)}); + server_def.set_task_index(0); string serialized_update = server_def.SerializeAsString(); TFE_ContextUpdateServerDef(ctx, 0, serialized_update.data(), serialized_update.size(), status); diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 548bf1337bb..724176505ba 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -1203,6 +1203,8 @@ void BM_ReadVariable(int iters) { CHECK_EQ(0, TFE_TensorHandleNumDims(h, status)); CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); h = nullptr; + TFE_OpAddInput(op, var_handle, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); } tensorflow::testing::StopTiming(); TFE_DeleteOp(op); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc index bbdc4c8f410..29b624b8537 100644 --- a/tensorflow/c/eager/c_api_test_util.cc +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -150,6 +150,7 @@ TFE_TensorHandle* TestVariable(TFE_Context* ctx, float value, TFE_TensorHandle* var_handle = nullptr; int num_retvals = 1; TFE_Execute(op, &var_handle, &num_retvals, status); + if (TF_GetCode(status) != TF_OK) return nullptr; TFE_DeleteOp(op); if (TF_GetCode(status) != TF_OK) return nullptr; CHECK_EQ(1, num_retvals); diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc index c7583c374f2..0b693085da3 100644 --- a/tensorflow/core/common_runtime/device_mgr.cc +++ b/tensorflow/core/common_runtime/device_mgr.cc @@ -45,6 +45,7 @@ StaticDeviceMgr::StaticDeviceMgr(std::vector> devices) } const auto& t = d->device_type(); device_type_counts_[t]++; + device_incarnation_set_.insert(d->attributes().incarnation()); if (cpu_device_ == nullptr && t == "CPU" && d->parsed_name().id == 0) { cpu_device_ = d.get(); } @@ -123,6 +124,10 @@ Status StaticDeviceMgr::LookupDevice(StringPiece name, Device** device) const { return Status::OK(); } +bool StaticDeviceMgr::ContainsDevice(int64 device_incarnation) const { + return device_incarnation_set_.contains(device_incarnation); +} + void StaticDeviceMgr::ClearContainers( gtl::ArraySlice containers) const { Status s; diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h index 56248b39078..83a0d0cc29c 100644 --- a/tensorflow/core/common_runtime/device_mgr.h +++ b/tensorflow/core/common_runtime/device_mgr.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/lib/core/arena.h" #include "tensorflow/core/lib/core/status.h" @@ -56,6 +57,11 @@ class DeviceMgr { // Accepts either a full device name, or just the replica-local suffix. 
virtual Status LookupDevice(StringPiece name, Device** device) const = 0; + // Check if the current device manager contains device with the given + // incarnation ID. Looking up by incarnation IDs because they are randomly + // generated and not intentionally reused (unlike device pointers). + virtual bool ContainsDevice(int64 device_incarnation) const = 0; + // Clears given containers of all devices if 'container' is // non-empty. Otherwise, clears default containers of all devices. virtual void ClearContainers(gtl::ArraySlice containers) const = 0; @@ -86,6 +92,7 @@ class StaticDeviceMgr : public DeviceMgr { string DebugString() const override; string DeviceMappingString() const override; Status LookupDevice(StringPiece name, Device** device) const override; + bool ContainsDevice(int64 device_incarnation) const override; void ClearContainers(gtl::ArraySlice containers) const override; int NumDeviceType(const string& type) const override; Device* HostCPU() const override; @@ -95,6 +102,7 @@ class StaticDeviceMgr : public DeviceMgr { StringPiece CopyToBackingStore(StringPiece s); + absl::flat_hash_set device_incarnation_set_; std::unordered_map device_map_; core::Arena name_backing_store_; // Storage for keys in device_map_ std::unordered_map device_type_counts_; @@ -117,6 +125,7 @@ class DynamicDeviceMgr : public DeviceMgr { string DebugString() const override; string DeviceMappingString() const override; Status LookupDevice(StringPiece name, Device** device) const override; + bool ContainsDevice(int64 device_incarnation) const override; void ClearContainers(gtl::ArraySlice containers) const override; int NumDeviceType(const string& type) const override; Device* HostCPU() const override; @@ -140,6 +149,7 @@ class DynamicDeviceMgr : public DeviceMgr { std::unordered_map> dynamic_devices_ TF_GUARDED_BY(devices_mu_); + absl::flat_hash_set device_incarnation_set_ TF_GUARDED_BY(devices_mu_); std::unordered_map device_map_ TF_GUARDED_BY(devices_mu_); std::unordered_map device_type_counts_ diff --git a/tensorflow/core/common_runtime/dynamic_device_mgr.cc b/tensorflow/core/common_runtime/dynamic_device_mgr.cc index f35fa7e416a..f47de47c5b9 100644 --- a/tensorflow/core/common_runtime/dynamic_device_mgr.cc +++ b/tensorflow/core/common_runtime/dynamic_device_mgr.cc @@ -92,6 +92,11 @@ Status DynamicDeviceMgr::LookupDevice(StringPiece name, Device** device) const { return Status::OK(); } +bool DynamicDeviceMgr::ContainsDevice(int64 device_incarnation) const { + tf_shared_lock l(devices_mu_); + return device_incarnation_set_.contains(device_incarnation); +} + void DynamicDeviceMgr::ClearContainers( gtl::ArraySlice containers) const { Status s; @@ -138,6 +143,7 @@ Status DynamicDeviceMgr::AddDevices( device_map_[name] = d.get(); } device_type_counts_[d->device_type()]++; + device_incarnation_set_.insert(d->attributes().incarnation()); dynamic_devices_.emplace(d.get(), std::move(d)); } return Status::OK(); @@ -171,6 +177,7 @@ Status DynamicDeviceMgr::RemoveDevices(std::vector devices) { device_map_.erase(name); } device_type_counts_[d->device_type()]--; + device_incarnation_set_.erase(d->attributes().incarnation()); dynamic_devices_.erase(it); } return Status::OK(); diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 35dd9990054..3036e6d7989 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -874,6 +874,19 @@ bool IsPinnableOp(const string& op_type) { 
!absl::StartsWith(op_type, "XRT"); } +// Validate if the remote device with the given incarnation is valid in the +// remote device manager of the current eager context. +Status ValidateTensorHandleRemoteDevice(EagerContext* ctx, + int64 device_incarnation) { + if (ctx->remote_device_mgr()->ContainsDevice(device_incarnation)) { + return Status::OK(); + } + return errors::InvalidArgument( + "Resource input tensor contains an invalid device. This might happen " + "when the client has connected to a different cluster, or some remote " + "workers have been restarted."); +} + // The Op device may be updated if: // - A resource touching input is specified: all resource-touching ops run in // the device the resource is, regardless of anything else that has been @@ -935,6 +948,10 @@ Status MaybeUpdateOpDevice(EagerOperation* op) { for (int i = 0; i < op->Inputs().size(); ++i) { TensorHandle* tensor_handle = op->Inputs()[i]; if (tensor_handle->dtype == DT_RESOURCE) { + if (tensor_handle->resource_remote_device_incarnation() != 0) { + TF_RETURN_IF_ERROR(ValidateTensorHandleRemoteDevice( + &ctx, tensor_handle->resource_remote_device_incarnation())); + } Device* resource_device = tensor_handle->resource_device(); DVLOG(2) << "for op " << op->Name() << " input " << i << " " << DataTypeString(tensor_handle->dtype) diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index eef46b691ce..dfe3e4a1426 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -49,6 +49,13 @@ limitations under the License. namespace tensorflow { +namespace { +int64 GetRemoteDeviceIncarnation(Device* device) { + if (device == nullptr || device->IsLocal()) return 0; + return device->attributes().incarnation(); +} +} // namespace + TensorHandle::PackedTensorHandleData::PackedTensorHandleData( std::vector&& handles, const TensorShape& shape) : handles_(std::move(handles)), shape_(shape) { @@ -244,6 +251,8 @@ TensorHandle::TensorHandle(tensorflow::Tensor&& t, Device* d, Device* op_device, device_((!ctx || d == ctx->HostCPU()) ? nullptr : d), op_device_(op_device), resource_device_(resource_device), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, std::move(t)) { DVLOG(3) << "Creating Local TensorHandle: " << this @@ -258,6 +267,8 @@ TensorHandle::TensorHandle(tensorflow::Tensor&& t, Device* d, Device* op_device, op_device_(op_device), resource_device_( GetResourceDevice(t.flat()(0), ctx)), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), resource_handle_info_( {t.flat()(0).dtypes_and_shapes(), @@ -274,6 +285,7 @@ TensorHandle::TensorHandle(tensorflow::Tensor&& t, CustomDevice* d, device_(d), op_device_(nullptr), resource_device_(nullptr), + resource_remote_device_incarnation_(0), ctx_(ctx), data_(absl::in_place_type, std::move(t)) { // TODO(allenl): Figure out a better op_device story for custom devices, @@ -297,6 +309,8 @@ TensorHandle::TensorHandle(Device* d, Device* op_device, device_((d == ctx->HostCPU()) ? 
nullptr : d), op_device_(op_device), resource_device_(resource_device), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type) { DVLOG(3) << "Creating empty Local TensorHandle: " << this @@ -354,6 +368,8 @@ TensorHandle::TensorHandle(std::vector&& handles, Device* device, device_(device), op_device_(device), resource_device_(dtype == DT_RESOURCE ? device : nullptr), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, std::move(handles), shape) { @@ -376,6 +392,8 @@ TensorHandle::TensorHandle(int64 op_id, int32 output_num, device_(d), op_device_(d), resource_device_(dtype == DT_RESOURCE ? d : nullptr), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, op_id, output_num, remote_task, ctx) { @@ -398,6 +416,8 @@ TensorHandle::TensorHandle(int64 op_id, int32 output_num, device_(d), op_device_(d), resource_device_(dtype == DT_RESOURCE ? d : nullptr), + resource_remote_device_incarnation_( + GetRemoteDeviceIncarnation(resource_device_)), ctx_(ctx), data_(absl::in_place_type, op_id, output_num, ctx->GetContextViewId()) { diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 45e7a3815a8..25d7fea3200 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -133,6 +133,9 @@ class TensorHandle : public AbstractTensorHandleInterface, VariantDevice device() const { return device_; } Device* op_device() const { return op_device_; } Device* resource_device() const { return resource_device_; } + int64 resource_remote_device_incarnation() const { + return resource_remote_device_incarnation_; + } VariantDevice DeviceOrHostCPU(const EagerContext& ctx) const; @@ -265,6 +268,9 @@ class TensorHandle : public AbstractTensorHandleInterface, // If the tensor dtype is DT_RESOURCE, resource_device_ holds the device // backing the resource. Else resource_device_ is nullptr. tensorflow::Device* const resource_device_; + // Incarnation ID of the resource device if it locates on a remote device, or + // 0 if it locates on a local device. + const int64 resource_remote_device_incarnation_; mutable mutex mu_; diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc index 2bcde7dce5b..779158375de 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/random.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { @@ -66,17 +67,28 @@ TEST(TensorHandle_ShapeTest, AsyncShape) { ctx->Unref(); } -static Device* CreateDevice(const char* type, const char* name) { +static Device* CreateDevice(const char* type, const char* name, + bool is_local = true) { class FakeDevice : public Device { public: - explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {} + explicit FakeDevice(const DeviceAttributes& attr, bool is_local) + : Device(nullptr, attr), is_local_(is_local) {} Status Sync() override { return Status::OK(); } Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } + bool IsLocal() const override { return is_local_; } + + private: + const bool is_local_; }; DeviceAttributes attr; attr.set_name(name); attr.set_device_type(type); - return new FakeDevice(attr); + int64 incarnation = random::New64(); + while (incarnation == 0) { + incarnation = random::New64(); + } + attr.set_incarnation(incarnation); + return new FakeDevice(attr, is_local); } } // namespace @@ -204,4 +216,87 @@ TEST_F(PackedTensorHandleTest, PackedHandle) { packed_handle->Unref(); } +TEST(TensorHandle_ResourceDeviceTest, OnLocalDevice) { + std::unique_ptr d0( + CreateDevice("CPU", "/job:localhost/replica:0/task:0/device:CPU:0")); + StaticDeviceMgr local_device_mgr(std::move(d0)); + auto ctx = new EagerContext( + SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + tensorflow::ContextMirroringPolicy::MIRRORING_NONE, false, false, + &local_device_mgr, false, nullptr, nullptr, nullptr); + + tensorflow::DataType dtype = DT_RESOURCE; + TensorShape shape = {2}; + Tensor t(dtype, shape); + + Device* d = local_device_mgr.ListDevices()[0]; + TensorHandle* th = + TensorHandle::CreateLocalHandle(std::move(t), d, d, d, ctx); + // Remote device incarnation for local resource should be 0 (invalid) + EXPECT_EQ(0, th->resource_remote_device_incarnation()); + // Local device manager must contain the resource device. 
+ EXPECT_TRUE(local_device_mgr.ContainsDevice( + th->resource_device()->attributes().incarnation())); + + std::unique_ptr d1( + CreateDevice("CPU", "/job:localhost/replica:0/task:0/device:CPU:0")); + StaticDeviceMgr new_device_mgr(std::move(d1)); + EXPECT_FALSE(new_device_mgr.ContainsDevice( + th->resource_device()->attributes().incarnation())); + + th->Unref(); + ctx->Unref(); +} + +TEST(TensorHandle_ResourceDeviceTest, OnRemoteDevice) { + std::unique_ptr d_local( + CreateDevice("CPU", "/job:localhost/replica:0/task:0/device:CPU:0")); + StaticDeviceMgr local_device_mgr(std::move(d_local)); + auto ctx = new EagerContext( + SessionOptions(), + tensorflow::ContextDevicePlacementPolicy::DEVICE_PLACEMENT_SILENT, + tensorflow::ContextMirroringPolicy::MIRRORING_NONE, false, false, + &local_device_mgr, false, nullptr, nullptr, nullptr); + + std::unique_ptr d0( + CreateDevice("CPU", "/job:worker/task:0/device:CPU:0", false)); + Device* d0_ptr = d0.get(); + std::unique_ptr d1( + CreateDevice("CPU", "/job:worker/task:1/device:CPU:0", false)); + Device* d1_ptr = d1.get(); + + DynamicDeviceMgr remote_device_mgr; + std::vector> vector_d0; + vector_d0.emplace_back(std::move(d0)); + TF_ASSERT_OK(remote_device_mgr.AddDevices(std::move(vector_d0))); + + TensorHandle* th0 = TensorHandle::CreateUnshapedRemoteHandle( + 0, 0, "", DT_RESOURCE, d0_ptr, ctx); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th0->resource_remote_device_incarnation())); + + std::vector> vector_d1; + vector_d1.emplace_back(std::move(d1)); + TF_ASSERT_OK(remote_device_mgr.AddDevices(std::move(vector_d1))); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th0->resource_remote_device_incarnation())); + + TensorHandle* th1 = TensorHandle::CreateUnshapedRemoteHandle( + 0, 0, "", DT_RESOURCE, d1_ptr, ctx); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th1->resource_remote_device_incarnation())); + + std::vector remove_d1{d1_ptr}; + TF_ASSERT_OK(remote_device_mgr.RemoveDevices(std::move(remove_d1))); + EXPECT_FALSE(remote_device_mgr.ContainsDevice( + th1->resource_remote_device_incarnation())); + EXPECT_TRUE(remote_device_mgr.ContainsDevice( + th0->resource_remote_device_incarnation())); + + th0->Unref(); + th1->Unref(); + ctx->Unref(); +} + } // namespace tensorflow From 296993a42ca74d7a49efcaa92d3b0dd427551980 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Tue, 12 May 2020 21:30:26 -0700 Subject: [PATCH 085/412] Remove deprecated variants of DynamicSlice and DynamicUpdateSlice builders Upgraded existing users by converting 1d start_slices to a list of scalars. I am expecting this to be performance neutral as these tensors are expected to be small. I decided against having the XlaBuilder do this internally as I guess we want to discourage usage of vector indices. PiperOrigin-RevId: 311261628 Change-Id: I4b779a58cfca1699bdf5104c236bc6453fd419bc --- .../tf2xla/kernels/dynamic_slice_ops.cc | 28 +++++++--- .../compiler/tf2xla/kernels/slice_op.cc | 22 +++++--- tensorflow/compiler/xla/client/xla_builder.cc | 51 ------------------- tensorflow/compiler/xla/client/xla_builder.h | 16 ------ tensorflow/compiler/xla/tests/while_test.cc | 2 +- 5 files changed, 36 insertions(+), 83 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc index bb2c0d9ddb8..5dbc083368c 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc @@ -28,6 +28,15 @@ limitations under the License. 
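With the XlaOp-typed start_indices overloads removed, callers that still hold a rank-1 indices tensor unpack it into per-dimension scalars, as the SliceVector helper in the hunk below does. A standalone sketch of that migration; input, start_indices, rank, and slice_sizes are assumed to be provided by the caller:

    // Each index is carved out of the rank-1 start_indices operand and reshaped
    // to a scalar before being passed to the remaining DynamicSlice overload.
    absl::InlinedVector<xla::XlaOp, 4> scalar_indices;
    scalar_indices.reserve(rank);
    for (int64 i = 0; i < rank; ++i) {
      scalar_indices.push_back(
          xla::Reshape(xla::Slice(start_indices, {i}, {i + 1}, {1}), {}));
    }
    xla::XlaOp result = xla::DynamicSlice(input, scalar_indices, slice_sizes);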
namespace tensorflow { namespace { +absl::InlinedVector SliceVector(xla::XlaOp input, int64 rank) { + absl::InlinedVector scalar_indices; + scalar_indices.reserve(rank); + for (int i = 0; i < rank; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(input, {i}, {i + 1}, {1}), {})); + return scalar_indices; +} + class DynamicUpdateSliceOp : public XlaOpKernel { public: explicit DynamicUpdateSliceOp(OpKernelConstruction* context) @@ -41,21 +50,23 @@ class DynamicUpdateSliceOp : public XlaOpKernel { const TensorShape update_shape = ctx->InputShape("update"); const TensorShape index_shape = ctx->InputShape("indices"); + int64 rank = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(index_shape) && - index_shape.num_elements() == input_shape.dims(), + index_shape.num_elements() == rank, errors::InvalidArgument("index must be a vector with length equal to " "the number of input dimensions")); OP_REQUIRES( - ctx, input_shape.dims() == update_shape.dims(), + ctx, rank == update_shape.dims(), errors::InvalidArgument("input and update must have the same rank," " input shape is ", input_shape.DebugString(), "; update shape is ", update_shape.DebugString())); + xla::XlaOp indices = ctx->Input("indices"); xla::XlaOp result = xla::DynamicUpdateSlice( - ctx->Input("input"), ctx->Input("update"), ctx->Input("indices")); + ctx->Input("input"), ctx->Input("update"), SliceVector(indices, rank)); ctx->SetOutput(0, result); } }; @@ -76,17 +87,18 @@ class DynamicSliceOp : public XlaOpKernel { const TensorShape start_indices_shape = ctx->InputShape("start_indices"); const TensorShape size_indices_shape = ctx->InputShape("size_indices"); + int64 rank = input_shape.dims(); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(start_indices_shape) && - start_indices_shape.num_elements() == input_shape.dims(), + start_indices_shape.num_elements() == rank, errors::InvalidArgument( "start_indices must be a vector with length equal to " "input rank, but input rank is ", - input_shape.dims(), " and start_indices has shape ", + rank, " and start_indices has shape ", start_indices_shape.DebugString())); OP_REQUIRES(ctx, TensorShapeUtils::IsVector(size_indices_shape) && - size_indices_shape.num_elements() == input_shape.dims(), + size_indices_shape.num_elements() == rank, errors::InvalidArgument( "size_indices must be a vector with length equal to " "input rank, but input rank is ", @@ -96,8 +108,10 @@ class DynamicSliceOp : public XlaOpKernel { std::vector size_indices; OP_REQUIRES_OK( ctx, ctx->ConstantInputAsIntVector("size_indices", &size_indices)); + + xla::XlaOp start_indices = ctx->Input("start_indices"); xla::XlaOp result = xla::DynamicSlice( - ctx->Input("input"), ctx->Input("start_indices"), size_indices); + ctx->Input("input"), SliceVector(start_indices, rank), size_indices); ctx->SetOutput(0, result); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 17d0b87edda..7f274c6b00f 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -42,19 +42,17 @@ class SliceOp : public XlaOpKernel { const TensorShape begin_tensor_shape = ctx->InputShape(1); const TensorShape size_tensor_shape = ctx->InputShape(2); + const int input_dims = input_shape.dims(); OP_REQUIRES( ctx, TensorShapeUtils::IsVector(begin_tensor_shape) && TensorShapeUtils::IsVector(size_tensor_shape) && - begin_tensor_shape.num_elements() == input_shape.dims() && - size_tensor_shape.num_elements() == 
input_shape.dims(), + begin_tensor_shape.num_elements() == input_dims && + size_tensor_shape.num_elements() == input_dims, errors::InvalidArgument( "Expected begin and size arguments to be 1-D tensors of size ", - input_shape.dims(), ", but got shapes ", - begin_tensor_shape.DebugString(), " and ", - size_tensor_shape.DebugString(), " instead.")); - - const int input_dims = input_shape.dims(); + input_dims, ", but got shapes ", begin_tensor_shape.DebugString(), + " and ", size_tensor_shape.DebugString(), " instead.")); std::vector begin; std::vector size; @@ -129,7 +127,15 @@ class SliceOp : public XlaOpKernel { input_shape.dim_size(i), "], but ", "got ", size[i])); } - ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), ctx->Input(1), size)); + + absl::InlinedVector scalar_indices; + scalar_indices.reserve(input_dims); + xla::XlaOp begin = ctx->Input("begin"); + for (int i = 0; i < input_dims; i++) + scalar_indices.push_back( + xla::Reshape(xla::Slice(begin, {i}, {i + 1}, {1}), {})); + + ctx->SetOutput(0, xla::DynamicSlice(ctx->Input(0), scalar_indices, size)); } } }; diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index bd70ce80082..6539817d524 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -860,28 +860,6 @@ XlaOp XlaBuilder::SliceInDim(XlaOp operand, int64 start_index, }); } -XlaOp XlaBuilder::DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferDynamicSliceShape( - *operand_shape, {*start_indices_shape}, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, - {operand, start_indices}); - }); -} - XlaOp XlaBuilder::DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { @@ -910,26 +888,6 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, }); } -XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp start_indices) { - return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); - TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); - TF_ASSIGN_OR_RETURN(const Shape* start_indices_shape, - GetShapePtr(start_indices)); - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferDynamicUpdateSliceShape( - *operand_shape, *update_shape, {*start_indices_shape})); - *instr.mutable_shape() = shape.ToProto(); - - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - {operand, update, start_indices}); - }); -} - XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices) { return ReportErrorOrReturn([&]() -> StatusOr { @@ -3152,20 +3110,11 @@ XlaOp SliceInDim(const XlaOp operand, int64 start_index, int64 limit_index, stride, dimno); } -XlaOp DynamicSlice(const XlaOp operand, const XlaOp start_indices, - absl::Span slice_sizes) { - return operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); -} XlaOp DynamicSlice(const XlaOp operand, absl::Span start_indices, absl::Span slice_sizes) { return 
operand.builder()->DynamicSlice(operand, start_indices, slice_sizes); } -XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, - const XlaOp start_indices) { - return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); -} - XlaOp DynamicUpdateSlice(const XlaOp operand, const XlaOp update, absl::Span start_indices) { return operand.builder()->DynamicUpdateSlice(operand, update, start_indices); diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 33fe62e9322..24b0cba3a1b 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -421,14 +421,9 @@ class XlaBuilder { virtual XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); - ABSL_DEPRECATED("Use span-of-indices form instead") - XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); @@ -858,14 +853,10 @@ class XlaBuilder { friend XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, int64 stride, int64 dimno); - friend XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); friend XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); - friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, - XlaOp start_indices); friend XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); @@ -1438,10 +1429,6 @@ XlaOp SliceInDim(XlaOp operand, int64 start_index, int64 limit_index, XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, - absl::Span slice_sizes); - // Enqueues a dynamic update slice operation onto the computation, which // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. // The shape of 'update' determines the shape of the slice of 'operand' @@ -1462,9 +1449,6 @@ XlaOp DynamicSlice(XlaOp operand, XlaOp start_indices, XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); -ABSL_DEPRECATED("Use span-of-indices form instead") -XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, XlaOp start_indices); - // Enqueues a concatenate instruction onto the computation. 'operands' must // have >= 1 entry. XlaOp ConcatInDim(XlaBuilder* builder, absl::Span operands, diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 5a482305513..d575bbb1f3e 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -863,7 +863,7 @@ XLA_TEST_F(WhileTest, WhileWithDynamicUpdateSlice) { // Starts = iteration * 2; auto starts = Mul(iteration, ConstantR0(&builder, 2)); // UpdateSlice. 
- auto out1 = DynamicUpdateSlice(input, update, starts); + auto out1 = DynamicUpdateSlice(input, update, {starts}); Tuple(&builder, {out0, out1}); body = builder.Build().ConsumeValueOrDie(); From 2db6e2e05da4ebea6d2faffb94a955abcb5248f9 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Tue, 12 May 2020 21:34:16 -0700 Subject: [PATCH 086/412] Support Core ML 3 for Core ML delegate when running on iOS 13 Added option to choose Core ML 2 even when running on iOS 13. Currently Reshape is not supported in Core ML 3, because it only accepts 5D tensor and it's not likely for model to have 5D tensor when coming from TFLite. PiperOrigin-RevId: 311261992 Change-Id: I9161cfb734b11ccd053c8a22a142f8cf72132e5a --- .../delegates/coreml/builders/op_builder.cc | 10 +++ .../delegates/coreml/builders/op_builder.h | 4 + .../delegates/coreml/builders/op_validator.h | 3 +- .../coreml/builders/reshape_op_builder.cc | 6 +- .../delegates/coreml/coreml_delegate.h | 7 ++ .../delegates/coreml/coreml_delegate.mm | 85 +++++++++++-------- .../delegates/coreml/coreml_delegate_kernel.h | 3 + .../coreml/coreml_delegate_kernel.mm | 27 +++--- .../delegates/coreml/coreml_executor.h | 1 + .../delegates/coreml/coreml_executor.mm | 46 ++++++++-- .../swift/Sources/CoreMLDelegate.swift | 4 + .../lite/g3doc/performance/coreml_delegate.md | 12 ++- 12 files changed, 153 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc index 09c386b55f0..2581b58f1e4 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.cc @@ -87,6 +87,16 @@ OpBuilder* GraphBuilder::AddBuilder( CoreML::Specification::Model* GraphBuilder::BuildModel() { CoreML::Specification::Model* model = new CoreML::Specification::Model(); + if (coreml_version_ == 2) { // Core ML 2, iOS >= 12.0 + model->set_specificationversion(3); + } else if (coreml_version_ == 3) { // Core ML 3, iOS >= 13.0 + model->set_specificationversion(4); + model->mutable_neuralnetwork()->set_arrayinputshapemapping( + CoreML::Specification::EXACT_ARRAY_MAPPING); + } else { + fprintf(stderr, "Unsupported Core ML version: %d\n", coreml_version_); + return nullptr; + } auto* neural_network = model->mutable_neuralnetwork(); for (auto& builder : builders_) { CoreML::Specification::NeuralNetworkLayer* layer = builder->Build(); diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h index 5367ae20d2f..c59c30a5a28 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_builder.h @@ -52,6 +52,8 @@ class TensorID { // API is experimental and subject to change. class GraphBuilder { public: + explicit GraphBuilder(int coreml_version) : coreml_version_(coreml_version) {} + // Returns pointer to the created builder. Ownership still belongs // to the GraphBuilder. OpBuilder* AddBuilder(int builtin_code, const TfLiteNode* node); @@ -79,6 +81,8 @@ class GraphBuilder { // This information is used to mark constant tensors that are used as input. 
bool IsTensorUsed(int tflite_tensor_index); + const int coreml_version_; + private: std::vector> builders_; // Index in the vector is the tflite_tensor_index, the value diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h b/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h index b0fe24ee288..501a304706c 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h +++ b/tensorflow/lite/experimental/delegates/coreml/builders/op_validator.h @@ -32,7 +32,8 @@ bool IsFullyConnectedOpSupported(const TfLiteRegistration* registration, const TfLiteNode* node, TfLiteContext* context); bool IsReshapeOpSupported(const TfLiteRegistration* registration, - const TfLiteNode* node, TfLiteContext* context); + const TfLiteNode* node, TfLiteContext* context, + int coreml_version); bool IsResizeBilinearOpSupported(const TfLiteRegistration* registration, const TfLiteNode* node, TfLiteContext* context); diff --git a/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc b/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc index 33040e2e070..b7b78653d36 100644 --- a/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc +++ b/tensorflow/lite/experimental/delegates/coreml/builders/reshape_op_builder.cc @@ -114,7 +114,11 @@ TfLiteStatus ReshapeOpBuilder::RegisterOutputs(const TfLiteIntArray* outputs, } bool IsReshapeOpSupported(const TfLiteRegistration* registration, - const TfLiteNode* node, TfLiteContext* context) { + const TfLiteNode* node, TfLiteContext* context, + int coreml_version) { + if (coreml_version >= 3) { + return false; + } if (node->inputs->size == 1) { const auto* params = reinterpret_cast(node->builtin_data); diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h index 0d75afc8e34..8ad81040499 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.h @@ -31,6 +31,13 @@ typedef enum { typedef struct { // Only create delegate when Neural Engine is available on the device. TfLiteCoreMlDelegateEnabledDevices enabled_devices; + // Specifies target Core ML version for model conversion. + // Core ML 3 come with a lot more ops, but some ops (e.g. reshape) is not + // delegated due to input rank constraint. + // if not set to one of the valid versions, the delegate will use highest + // version possible in the platform. + // Valid versions: (2, 3) + int coreml_version; // This sets the maximum number of Core ML delegates created. // Each graph corresponds to one delegated node subset in the // TFLite model. Set this to 0 to delegate all possible partitions. 
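A hypothetical usage sketch of the extended C API options (field names come from this patch; the TfLiteCoreMlDelegateDelete cleanup call and the interpreter variable are assumptions, not shown in these hunks):

    TfLiteCoreMlDelegateOptions options = {};
    options.coreml_version = 3;            // target Core ML 3 (iOS 13+); use 2 on iOS 12
    options.max_delegated_partitions = 0;  // 0 delegates all possible partitions
    TfLiteDelegate* delegate = TfLiteCoreMlDelegateCreate(&options);
    if (delegate != nullptr) {
      interpreter->ModifyGraphWithDelegate(delegate);
      // ... run inference; release the delegate after the interpreter is gone,
      // e.g. TfLiteCoreMlDelegateDelete(delegate) if that call is available.
    }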
diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm index 5d0564ebc48..58728659894 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate.mm @@ -36,7 +36,7 @@ constexpr int kMinNodesPerCoreMlDelegate = 2; using delegates::coreml::CoreMlDelegateKernel; bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, const TfLiteNode* node, - TfLiteContext* context) { + TfLiteContext* context, const TfLiteCoreMlDelegateOptions* options) { if (@available(iOS 11.0, *)) { } else { return false; @@ -120,7 +120,8 @@ bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, const TfL return true; } case kTfLiteBuiltinReshape: { - return delegates::coreml::IsReshapeOpSupported(registration, node, context); + return delegates::coreml::IsReshapeOpSupported(registration, node, context, + options->coreml_version); } case kTfLiteBuiltinResizeBilinear: { return delegates::coreml::IsResizeBilinearOpSupported(registration, node, context); @@ -142,6 +143,39 @@ bool IsNodeSupportedByDelegate(const TfLiteRegistration* registration, const TfL return false; } +class CoreMlDelegate : public TfLiteDelegate { + public: + explicit CoreMlDelegate(const TfLiteCoreMlDelegateOptions* params) + : params_(params != nullptr ? *params : TfLiteCoreMlDelegateOptions()) { + { + if (@available(iOS 13.0, *)) { + if (params_.coreml_version != 2 && params_.coreml_version != 3) { + NSLog(@"coreml_version must be 2 or 3. Setting to 3."); + params_.coreml_version = 3; + } + } else if (@available(iOS 12.0, *)) { + if (params_.coreml_version != 2) { + NSLog(@"coreml_version must be 2 - using Core ML version 2."); + params_.coreml_version = 2; + } + } + if (params_.max_delegated_partitions <= 0) { + params_.max_delegated_partitions = std::numeric_limits::max(); + } + if (params_.min_nodes_per_partition <= 0) { + params_.min_nodes_per_partition = kMinNodesPerCoreMlDelegate; + } + } + } + + TfLiteCoreMlDelegateOptions* params() { return ¶ms_; } + + bool VerifyDelegate() { return true; } + + private: + TfLiteCoreMlDelegateOptions params_; +}; + TfLiteRegistration GetCoreMlKernelRegistration() { // This is the registration for the Delegate Node that gets added to // the TFLite graph instead of the subGraph it replaces it. 
@@ -158,8 +192,10 @@ TfLiteRegistration GetCoreMlKernelRegistration() { }; kernel_registration.init = [](TfLiteContext* context, const char* buffer, size_t length) -> void* { - const TfLiteDelegateParams* params = reinterpret_cast(buffer); - CoreMlDelegateKernel* coreml_kernel = new CoreMlDelegateKernel(); + const auto* params = reinterpret_cast(buffer); + const auto* coreml_options = + (reinterpret_cast(params->delegate))->params(); + CoreMlDelegateKernel* coreml_kernel = new CoreMlDelegateKernel(coreml_options->coreml_version); if (coreml_kernel->Init(context, params) != kTfLiteOk) { delete coreml_kernel; return nullptr; @@ -187,14 +223,12 @@ TfLiteRegistration GetCoreMlKernelRegistration() { } TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { - const auto* params = - reinterpret_cast(delegate->data_); + const auto* params = reinterpret_cast(delegate->data_); - delegates::IsNodeSupportedFn node_supported_fn = - [=](TfLiteContext* context, TfLiteNode* node, - TfLiteRegistration* registration, - std::string* unsupported_details) -> bool { - return IsNodeSupportedByDelegate(registration, node, context); + delegates::IsNodeSupportedFn node_supported_fn = [=](TfLiteContext* context, TfLiteNode* node, + TfLiteRegistration* registration, + std::string* unsupported_details) -> bool { + return IsNodeSupportedByDelegate(registration, node, context, params); }; delegates::GraphPartitionHelper helper(context, node_supported_fn); @@ -214,7 +248,8 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { // Set first element to the number of nodes to replace. supported_nodes[0] = supported_nodes.size() - 1; - TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, "CoreML delegate: %d nodes delegated out of %d nodes, " + TFLITE_LOG_PROD(tflite::TFLITE_LOG_INFO, + "CoreML delegate: %d nodes delegated out of %d nodes, " "with %d partitions.\n", supported_nodes[0], helper.num_total_nodes(), delegate_partitions.size()); @@ -223,28 +258,6 @@ TfLiteStatus DelegatePrepare(TfLiteContext* context, TfLiteDelegate* delegate) { reinterpret_cast(supported_nodes.data()), delegate); } -class CoreMlDelegate : public TfLiteDelegate { - public: - explicit CoreMlDelegate(const TfLiteCoreMlDelegateOptions* params) - : params_(params != nullptr ? *params : TfLiteCoreMlDelegateOptions()) { - { - if (params_.max_delegated_partitions <= 0) { - params_.max_delegated_partitions = std::numeric_limits::max(); - } - if (params_.min_nodes_per_partition <= 0) { - params_.min_nodes_per_partition = kMinNodesPerCoreMlDelegate; - } - } - } - - TfLiteCoreMlDelegateOptions* params() { return ¶ms_; } - - bool VerifyDelegate() { return true; } - - private: - TfLiteCoreMlDelegateOptions params_; -}; - TfLiteDelegate* CreateCoreMlDelegate(const TfLiteCoreMlDelegateOptions* options) { TfLiteDelegate* delegate = new CoreMlDelegate(options); if (!static_cast(delegate)->VerifyDelegate()) { @@ -288,7 +301,7 @@ bool IsNeuralEngineAvailable() { } // namespace TfLiteDelegate* TfLiteCoreMlDelegateCreate(const TfLiteCoreMlDelegateOptions* options) { - if (@available(iOS 11.0, *)) { + if (@available(iOS 12.0, *)) { if (options->enabled_devices == TfLiteCoreMlDelegateDevicesWithNeuralEngine && !IsNeuralEngineAvailable()) { NSLog(@"This device does not have Neural Engine, so Core ML delegate will not be enabled. 
" @@ -299,7 +312,7 @@ TfLiteDelegate* TfLiteCoreMlDelegateCreate(const TfLiteCoreMlDelegateOptions* op return tflite::CreateCoreMlDelegate(options); } else { NSLog(@"Core ML delegate is not supported in this iOS version. " - "Minimum required iOS version is 11.0."); + "Minimum required iOS version is 12.0."); return nullptr; } } diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h index 04053ea81c1..8c983fb11aa 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.h @@ -29,6 +29,8 @@ namespace coreml { // implements Init/Prepare/Invoke as TFLite kernel nodes. class CoreMlDelegateKernel { public: + explicit CoreMlDelegateKernel(int coreml_version) + : coreml_version_(coreml_version) {} // Initialize the delegated graph and add required nodes. TfLiteStatus Init(TfLiteContext* context, const TfLiteDelegateParams* params); @@ -56,6 +58,7 @@ class CoreMlDelegateKernel { std::unique_ptr builder_; std::unique_ptr model_; ::CoreMlExecutor* executor_; + int coreml_version_; std::vector input_tensor_ids_; std::vector inputs_; diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm index a36837bcc44..6a668bc971b 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_delegate_kernel.mm @@ -60,7 +60,7 @@ void TransposeToHWC(const float* chw, float* hwc, const TfLiteIntArray* hwc_dims TfLiteStatus CoreMlDelegateKernel::Init(TfLiteContext* context, const TfLiteDelegateParams* delegate_params) { - if (@available(iOS 11.0, *)) { + if (@available(iOS 12.0, *)) { executor_ = [[::CoreMlExecutor alloc] init]; TF_LITE_ENSURE_STATUS(BuildModel(context, delegate_params)); // Serialize the model protocol buffer and compile it. @@ -76,7 +76,7 @@ TfLiteStatus CoreMlDelegateKernel::Init(TfLiteContext* context, } return kTfLiteOk; } else { - TF_LITE_KERNEL_LOG(context, "Minimum required iOS version is 11.0."); + TF_LITE_KERNEL_LOG(context, "Minimum required iOS version is 12.0."); return kTfLiteError; } } @@ -104,6 +104,9 @@ void CoreMlDelegateKernel::AddOutputTensors(const TfLiteIntArray* output_tensors int batch_size, height_size, width_size, depth_size; GetDims(&batch_size, &height_size, &width_size, &depth_size, tensor.dims); multi_array->set_datatype(CoreML::Specification::ArrayFeatureType::FLOAT32); + if (coreml_version_ >= 3) { + multi_array->mutable_shape()->Add(batch_size); + } multi_array->mutable_shape()->Add(depth_size); multi_array->mutable_shape()->Add(height_size); multi_array->mutable_shape()->Add(width_size); @@ -114,7 +117,7 @@ TfLiteStatus CoreMlDelegateKernel::BuildModel(TfLiteContext* context, const TfLiteDelegateParams* delegate_params) { TfLiteNode* node; TfLiteRegistration* reg; - builder_.reset(new delegates::coreml::GraphBuilder()); + builder_.reset(new delegates::coreml::GraphBuilder(coreml_version_)); // Add Inputs AddInputTensors(delegate_params->input_tensors, context); // Build all ops. @@ -144,8 +147,6 @@ TfLiteStatus CoreMlDelegateKernel::BuildModel(TfLiteContext* context, return kTfLiteError; } AddOutputTensors(delegate_params->output_tensors, context); - // TODO(karimnosseir): Set correct version ? 
- model_->set_specificationversion(1); auto* model_description = model_->mutable_description(); for (int i = 0; i < delegate_params->input_tensors->size; ++i) { const int tensor_id = delegate_params->input_tensors->data[i]; @@ -158,6 +159,9 @@ TfLiteStatus CoreMlDelegateKernel::BuildModel(TfLiteContext* context, int batch_size, height_size, width_size, depth_size; GetDims(&batch_size, &height_size, &width_size, &depth_size, tensor.dims); multi_array->set_datatype(CoreML::Specification::ArrayFeatureType::FLOAT32); + if (coreml_version_ >= 3) { + multi_array->mutable_shape()->Add(batch_size); + } multi_array->mutable_shape()->Add(depth_size); multi_array->mutable_shape()->Add(height_size); multi_array->mutable_shape()->Add(width_size); @@ -181,9 +185,12 @@ TfLiteStatus CoreMlDelegateKernel::Prepare(TfLiteContext* context, TfLiteNode* n int batch_size, height_size, width_size, depth_size; GetDims(&batch_size, &height_size, &width_size, &depth_size, tensor->dims); - inputs_.push_back({std::vector(input_size), - builder_->GetTensorName(tensor_index), - {depth_size, height_size, width_size}}); + std::vector input_shape = {depth_size, height_size, width_size}; + if (coreml_version_ >= 3) { + input_shape.insert(input_shape.begin(), batch_size); + } + inputs_.push_back( + {std::vector(input_size), builder_->GetTensorName(tensor_index), input_shape}); } outputs_.reserve(node->outputs->size); @@ -222,9 +229,7 @@ TfLiteStatus CoreMlDelegateKernel::Invoke(TfLiteContext* context, TfLiteNode* no } } -CoreMlDelegateKernel::~CoreMlDelegateKernel() { - [executor_ cleanup]; -} +CoreMlDelegateKernel::~CoreMlDelegateKernel() { [executor_ cleanup]; } } // namespace coreml } // namespace delegates diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h index edec3020cbc..5ce0a0ade6c 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.h @@ -45,4 +45,5 @@ struct TensorData { @property MLModel* model API_AVAILABLE(ios(11)); @property NSString* mlModelFilePath; @property NSString* compiledModelFilePath; +@property(nonatomic, readonly) int coreMlVersion; @end diff --git a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm index 2091c0d7ca0..1f808e08d49 100644 --- a/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm +++ b/tensorflow/lite/experimental/delegates/coreml/coreml_executor.mm @@ -39,17 +39,22 @@ NSURL* createTemporaryFile() { NSSet* _featureNames; } -- (instancetype)initWithInputs:(const std::vector*)inputs; +- (instancetype)initWithInputs:(const std::vector*)inputs + coreMlVersion:(int)coreMlVersion; - (MLFeatureValue*)featureValueForName:(NSString*)featureName API_AVAILABLE(ios(11)); - (NSSet*)featureNames; +@property(nonatomic, readonly) int coreMlVersion; + @end @implementation MultiArrayFeatureProvider -- (instancetype)initWithInputs:(const std::vector*)inputs { +- (instancetype)initWithInputs:(const std::vector*)inputs + coreMlVersion:(int)coreMlVersion { self = [super init]; _inputs = inputs; + _coreMlVersion = coreMlVersion; for (auto& input : *_inputs) { if (input.name.empty()) { return nil; @@ -74,8 +79,31 @@ NSURL* createTemporaryFile() { for (auto& input : *_inputs) { if ([featureName cStringUsingEncoding:NSUTF8StringEncoding] == input.name) { // TODO(b/141492326): Update shape handling for higher ranks - NSArray* shape = @[ 
@(input.shape[0]), @(input.shape[1]), @(input.shape[2]) ]; - NSArray* strides = @[ @(input.shape[1] * input.shape[2]), @(input.shape[2]), @1 ]; + NSArray* shape = @[ + @(input.shape[0]), + @(input.shape[1]), + @(input.shape[2]), + ]; + NSArray* strides = @[ + @(input.shape[1] * input.shape[2]), + @(input.shape[2]), + @1, + ]; + + if ([self coreMlVersion] >= 3) { + shape = @[ + @(input.shape[0]), + @(input.shape[1]), + @(input.shape[2]), + @(input.shape[3]), + ]; + strides = @[ + @(input.shape[1] * input.shape[2] * input.shape[3]), + @(input.shape[2] * input.shape[3]), + @(input.shape[3]), + @1, + ]; + }; NSError* error = nil; MLMultiArray* mlArray = [[MLMultiArray alloc] initWithDataPointer:(float*)input.data.data() shape:shape @@ -106,7 +134,7 @@ NSURL* createTemporaryFile() { } NSError* error = nil; MultiArrayFeatureProvider* inputFeature = - [[MultiArrayFeatureProvider alloc] initWithInputs:&inputs]; + [[MultiArrayFeatureProvider alloc] initWithInputs:&inputs coreMlVersion:[self coreMlVersion]]; if (inputFeature == nil) { NSLog(@"inputFeature is not initialized."); return NO; @@ -153,6 +181,14 @@ NSURL* createTemporaryFile() { - (NSURL*)saveModel:(CoreML::Specification::Model*)model { NSURL* modelUrl = createTemporaryFile(); NSString* modelPath = [modelUrl path]; + if (model->specificationversion() == 3) { + _coreMlVersion = 2; + } else if (model->specificationversion() == 4) { + _coreMlVersion = 3; + } else { + NSLog(@"Only Core ML models with specification version 3 or 4 are supported"); + return nil; + } // Flush data to file. // TODO(karimnosseir): Can we mmap this instead of actual writing it to phone ? std::ofstream file_stream([modelPath UTF8String], std::ios::out | std::ios::binary); diff --git a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift index 9862de31e2c..5a1526d45ea 100644 --- a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift +++ b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift @@ -35,6 +35,7 @@ public final class CoreMLDelegate: Delegate { self.options = options var delegateOptions = TfLiteCoreMlDelegateOptions() delegateOptions.enabled_devices = options.enabledDevices.cEnabledDevices + delegateOptions.coreml_version = Int32(options.coreMLVersion) delegateOptions.max_delegated_partitions = Int32(options.maxDelegatedPartitions) delegateOptions.min_nodes_per_partition = Int32(options.minNodesPerPartition) guard let delegate = TfLiteCoreMlDelegateCreate(&delegateOptions) else { return nil } @@ -72,6 +73,9 @@ extension CoreMLDelegate { /// value is `.neuralEngine` indicating that the delegate is enabled for Neural Engine devices /// only. public var enabledDevices: EnabledDevices = .neuralEngine + /// Target Core ML version for the model conversion. When it's not set, Core ML version will + /// be set to highest available version for the platform. + public var coreMLVersion = 0 /// The maximum number of Core ML delegate partitions created. Each graph corresponds to one /// delegated node subset in the TFLite model. The default value is `0` indicating that all /// possible partitions are delegated. diff --git a/tensorflow/lite/g3doc/performance/coreml_delegate.md b/tensorflow/lite/g3doc/performance/coreml_delegate.md index da3b943fd89..c267347cf3f 100644 --- a/tensorflow/lite/g3doc/performance/coreml_delegate.md +++ b/tensorflow/lite/g3doc/performance/coreml_delegate.md @@ -6,7 +6,7 @@ which results in faster model inference on iOS devices. 
Note: This delegate is in experimental (beta) phase. -Note: Core ML delegate is using Core ML version 2.1. +Note: Core ML delegate supports Core ML version 2 and later. **Supported iOS versions and devices:** @@ -158,6 +158,14 @@ for more detail. Alternatively, you can implement your own set of blacklist devices using other libraries such as [DeviceKit](https://github.com/devicekit/DeviceKit). +### Using older Core ML version + +Although iOS 13 supports Core ML 3, the model might work better when it is +converted with the Core ML 2 model specification. The target conversion version is +set to the latest version by default, but you can change this by setting +`coreMLVersion` (in Swift, `coreml_version` in C API) in the delegate option to +an older version. + ## Supported ops Following ops are supported by the Core ML delegate. @@ -187,6 +195,8 @@ Following ops are supported by the Core ML delegate. * ReluN1To1 * Relu6 * Reshape + * Only supported when the target Core ML version is 2, not supported when + targeting Core ML 3. * ResizeBilinear * SoftMax * Tanh From 816582d7eaf62bc12252791ef7701d329edee6ff Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 12 May 2020 21:46:06 -0700 Subject: [PATCH 087/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311263158 Change-Id: I8cec18a5d0a7d93af71ec0a913936cf9c24c8131 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range.
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e71443f838b2a16aea5b8ff38cc8e211449206d1 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Tue, 12 May 2020 22:19:52 -0700 Subject: [PATCH 088/412] Updated image_test to work with newer versions of keras_preprocessing PiperOrigin-RevId: 311266601 Change-Id: I0cbcea629b4fff04c50628432515a8766dc10ec8 --- tensorflow/python/keras/preprocessing/image_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/keras/preprocessing/image_test.py b/tensorflow/python/keras/preprocessing/image_test.py index a577381874e..d2f4b18f7dd 100644 --- a/tensorflow/python/keras/preprocessing/image_test.py +++ b/tensorflow/python/keras/preprocessing/image_test.py @@ -146,8 +146,7 @@ class TestImage(keras_parameterized.TestCase): generator = preprocessing_image.ImageDataGenerator( data_format='unknown') - generator = preprocessing_image.ImageDataGenerator( - zoom_range=(2, 2)) + generator = preprocessing_image.ImageDataGenerator(zoom_range=(2., 2.)) def test_image_data_generator_fit(self): generator = preprocessing_image.ImageDataGenerator( From 088fc3a9b5701eec8073489417143a32ef25cdd5 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Tue, 12 May 2020 22:46:26 -0700 Subject: [PATCH 089/412] Support setting Core ML delegate's target Core ML version in benchmark PiperOrigin-RevId: 311269200 Change-Id: I343794d6af948c554d05a89c9e432c0975ddfa6c --- tensorflow/lite/tools/benchmark/README.md | 1 + tensorflow/lite/tools/delegates/README.md | 3 +++ tensorflow/lite/tools/delegates/coreml_delegate_provider.cc | 6 ++++++ 3 files changed, 10 insertions(+) diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md index a4f632c40a9..c44129cbbd3 100644 --- a/tensorflow/lite/tools/benchmark/README.md +++ b/tensorflow/lite/tools/benchmark/README.md @@ -87,6 +87,7 @@ the reported data on hexagon is in cycles, not in ms like on cpu. #### CoreML delegate * `use_coreml`: `bool` (default=false) +* `coreml_version`: `int` (default=0) #### External delegate * `external_delegate_path`: `string` (default="") diff --git a/tensorflow/lite/tools/delegates/README.md b/tensorflow/lite/tools/delegates/README.md index f0e15e9e71a..709fcffb24d 100644 --- a/tensorflow/lite/tools/delegates/README.md +++ b/tensorflow/lite/tools/delegates/README.md @@ -93,6 +93,9 @@ TFLite delegate. * `use_coreml`: `bool` (default=false) \ Whether to use the [Core ML delegate](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/delegates/coreml). This option is only available in iOS. +* `coreml_version`: `int` (default=0) \ + Target Core ML version for model conversion. The default value is 0 and it + means using the newest version that's available on the device. 
### External delegate provider * `external_delegate_path`: `string` (default="") \ diff --git a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc index 0d1a8ade368..c29555716a4 100644 --- a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc @@ -32,6 +32,7 @@ class CoreMlDelegateProvider : public DelegateProvider { CoreMlDelegateProvider() { #if defined(REAL_IPHONE_DEVICE) default_params_.AddParam("use_coreml", ToolParam::Create(true)); + default_params_.AddParam("coreml_version", ToolParam::Create(0)); #endif } std::vector CreateFlags(ToolParams* params) const final; @@ -49,6 +50,10 @@ std::vector CoreMlDelegateProvider::CreateFlags( #if defined(REAL_IPHONE_DEVICE) std::vector flags = { CreateFlag("use_coreml", params, "use Core ML"), + CreateFlag("coreml_version", params, + "Target Core ML version for model conversion. " + "The default value is 0 and it means using the newest " + "version that's available on the device."), }; return flags; #else @@ -71,6 +76,7 @@ TfLiteDelegatePtr CoreMlDelegateProvider::CreateTfLiteDelegate( if (params.Get("use_coreml")) { TfLiteCoreMlDelegateOptions coreml_opts = { .enabled_devices = TfLiteCoreMlDelegateAllDevices}; + coreml_opts.coreml_version = params.Get("coreml_version"); coreml_opts.max_delegated_partitions = params.Get("max_delegated_partitions"); coreml_opts.min_nodes_per_partition = From d5b3ec27d1d6bb157588ff3033a3d9bd2e46711f Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 12 May 2020 23:14:53 -0700 Subject: [PATCH 090/412] Allow dynamically configuring device placement Enable setting soft device placement as well as logging dynamically. This required ensuring the device placement policy was part of the cache key. Further, we fix the logging to ensure in eager mode if a kernel is retrieved from the kernel cache, then the execution is still logged. We also log closer to the actual op execution to avoid logging before all checks have been done. 
PiperOrigin-RevId: 311271808 Change-Id: I9765228894f84a3447cc03332a2559f6d933165b --- tensorflow/c/eager/c_api_experimental.cc | 14 ++++++ tensorflow/c/eager/c_api_experimental.h | 12 +++++ .../core/common_runtime/eager/context.h | 7 +-- .../core/common_runtime/eager/execute.cc | 45 ++++++++++++------- tensorflow/python/client/session_test.py | 11 ++++- tensorflow/python/eager/context.py | 16 +++---- tensorflow/python/eager/core_test.py | 1 - tensorflow/python/framework/config_test.py | 16 +++---- tensorflow/python/tfe_wrapper.cc | 12 +++++ 9 files changed, 95 insertions(+), 39 deletions(-) diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index dd9e5e111d9..0d71b11531b 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -657,3 +657,17 @@ TFE_TensorHandle* TFE_CreatePackedTensorHandle(TFE_Context* ctx, std::move(tensor_handles), context, &handle); return tensorflow::wrap(handle); } + +void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, unsigned char enable, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + context->SetAllowSoftPlacement(enable); +} + +void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, unsigned char enable, + TF_Status* status) { + tensorflow::EagerContext* context = + tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); + context->SetLogDevicePlacement(enable); +} diff --git a/tensorflow/c/eager/c_api_experimental.h b/tensorflow/c/eager/c_api_experimental.h index 584f7222111..1b8efe61ee0 100644 --- a/tensorflow/c/eager/c_api_experimental.h +++ b/tensorflow/c/eager/c_api_experimental.h @@ -549,6 +549,18 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_CreatePackedTensorHandle( TFE_Context* ctx, TFE_TensorHandle** handles, int* num_handles, TF_Status* status); +// Configure soft device placement policy for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetSoftDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + +// Configure device placement policy logging for the eager executor. Note this +// policy is applied to any subsequent op executions. +TF_CAPI_EXPORT void TFE_ContextSetLogDevicePlacement(TFE_Context* ctx, + unsigned char enable, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 683425919d1..d034aaf2f9c 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -300,7 +300,9 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel); bool LogDevicePlacement() const { return log_device_placement_; } + void SetLogDevicePlacement(bool enable) { log_device_placement_ = enable; } bool AllowSoftPlacement() const { return allow_soft_placement_; } + void SetAllowSoftPlacement(bool enable) { allow_soft_placement_ = enable; } bool LogMemory() const { return log_memory_; } Rendezvous* GetRendezvous() const { return rendezvous_; } @@ -625,9 +627,8 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { mutex metadata_mu_; RunMetadata run_metadata_ TF_GUARDED_BY(metadata_mu_); GraphCollector graph_collector_; - // TODO(fishx): Allow update following two bool after context creation. 
- const bool log_device_placement_; - const bool allow_soft_placement_; + std::atomic log_device_placement_; + std::atomic allow_soft_placement_; // Information related to step containers. std::atomic num_active_steps_; diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 3036e6d7989..f6b4370bbdc 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -365,6 +365,9 @@ Status GetOrCreateKernelAndDevice( Device* device = absl::get(op->Device()); Fprint128 cache_key = op->MutableAttrs()->CacheKey(op->DeviceName()); + /// Include soft placement policy in cache key since the placement strategy + // can change and thus affect which kernel is picked. + cache_key = FingerprintCat128(cache_key, ctx.AllowSoftPlacement()); std::vector input_dev_ptrs; absl::flat_hash_map*> composite_devices; @@ -488,13 +491,6 @@ Status GetOrCreateKernelAndDevice( << KernelsRegisteredForOp(op->Name()); op->SetDevice(device); } - if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { - string msg = strings::StrCat("Executing op ", ndef.op(), " in device ", - DeviceNameOrUnspecified(device)); - if (!logging::LogToListeners(msg)) { - LOG(INFO) << msg; - } - } FunctionLibraryRuntime* flr = device == nullptr ? nullptr : ctx.func_lib(device); @@ -607,6 +603,14 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals, int num_outputs = kernel->num_outputs(); TF_RETURN_IF_ERROR(ValidateInputTypeAndPlacement(&ctx, op, kernel)); + if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat("Executing op ", op->Name(), " in device ", + kernel->device()->name()); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } + } + GraphCollector* graph_collector = nullptr; if (ctx.ShouldStoreGraphs()) { graph_collector = ctx.GetGraphCollector(); @@ -841,6 +845,16 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, ctx.GetContextViewId(), eager_client.get(), op->MutableAttrs()->BuildNodeDef(), op->EagerContext().FuncLibDef(), op->Inputs(), {retvals, num_outputs})); + + if (op->EagerContext().LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat( + "Executing op ", op->Name(), " on task ", + DeviceNameUtils::ParsedNameToString(op->GetDeviceParsedName())); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } + } + Status s = executor.AddOrExecute(std::move(node)); // Since the operation failed, we need to Unref any outputs that were // allocated. 
@@ -1119,15 +1133,6 @@ Status EagerExecute(EagerOperation* op, TensorHandle** retvals, return EagerLocalExecute(op, retvals, num_retvals); } - if (op->EagerContext().LogDevicePlacement() || VLOG_IS_ON(1)) { - string msg = strings::StrCat( - "Executing op ", op->Name(), " on task ", - DeviceNameUtils::ParsedNameToString(op->GetDeviceParsedName())); - if (!logging::LogToListeners(msg)) { - LOG(INFO) << msg; - } - } - #if defined(IS_MOBILE_PLATFORM) return errors::Unimplemented( "Eager's remote execution is not available on mobile devices."); @@ -1428,6 +1433,14 @@ void EagerLocalExecuteAsync(EagerOperation* op, TensorHandle** retvals, return; } + if (ctx.LogDevicePlacement() || VLOG_IS_ON(1)) { + string msg = strings::StrCat("Executing op ", op->Name(), " in device ", + kernel->device()->name()); + if (!logging::LogToListeners(msg)) { + LOG(INFO) << msg; + } + } + GraphCollector* graph_collector = nullptr; if (ctx.ShouldStoreGraphs()) { graph_collector = ctx.GetGraphCollector(); diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py index dd8e64ac182..1c244c1b297 100644 --- a/tensorflow/python/client/session_test.py +++ b/tensorflow/python/client/session_test.py @@ -1917,6 +1917,9 @@ class SessionTest(test_util.TensorFlowTestCase): a = constant_op.constant(1) b = constant_op.constant(2) c = a + b + # Ensure if the same kernel with the same arguments is executed then its + # execution is logged. + d = a + b else: # Passing the config to the server, but not the session should still # result in logging device placement. @@ -1925,12 +1928,16 @@ class SessionTest(test_util.TensorFlowTestCase): a = constant_op.constant(1) b = constant_op.constant(2) c = a + b + d = a + b with session.Session(server.target) as sess: with CaptureStderr() as log: - sess.run(c) + c, d = sess.run([c, d]) + self.assertEqual(c, 3) + self.assertEqual(d, 3) # Ensure that we did log device placement. 
- self.assertTrue('/replica:0/task:0/device:CPU:0' in str(log), str(log)) + add_executions = [l for l in str(log).splitlines() if 'AddV2' in l] + self.assertEqual(len(add_executions), 2) @test_util.run_v1_only('b/120545219') def testLocalMasterSessionTimeout(self): diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 182b8478420..86b3d5cf95f 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1509,9 +1509,11 @@ class Context(object): return self.config.allow_soft_placement @soft_device_placement.setter - def soft_device_placement(self, enabled): - self._soft_device_placement = enabled + def soft_device_placement(self, enable): + if self._context_handle is not None: + pywrap_tfe.TFE_ContextSetSoftDevicePlacement(self._handle, enable) + self._soft_device_placement = enable self._thread_local_data.function_call_options = None @property @@ -1519,15 +1521,11 @@ class Context(object): return self.config.log_device_placement @log_device_placement.setter - def log_device_placement(self, enabled): - if self._log_device_placement == enabled: - return - + def log_device_placement(self, enable): if self._context_handle is not None: - raise RuntimeError( - "Device placement logging must be set at program startup") + pywrap_tfe.TFE_ContextSetLogDevicePlacement(self._handle, enable) - self._log_device_placement = enabled + self._log_device_placement = enable self._thread_local_data.function_call_options = None @property diff --git a/tensorflow/python/eager/core_test.py b/tensorflow/python/eager/core_test.py index 47b3966827f..c1401fc56ee 100644 --- a/tensorflow/python/eager/core_test.py +++ b/tensorflow/python/eager/core_test.py @@ -1112,5 +1112,4 @@ class EagerTensorCacheTest(test_util.TensorFlowTestCase): if __name__ == '__main__': - context.set_log_device_placement(True) test.main() diff --git a/tensorflow/python/framework/config_test.py b/tensorflow/python/framework/config_test.py index b07bb874385..3051f1d0623 100644 --- a/tensorflow/python/framework/config_test.py +++ b/tensorflow/python/framework/config_test.py @@ -159,7 +159,6 @@ class ConfigTest(test.TestCase, parameterized.TestCase): else: self.assertFalse(config.get_soft_device_placement()) - @def_function.function def mod(): with ops.device('/device:GPU:0'): a = constant_op.constant(1.0) @@ -172,8 +171,10 @@ class ConfigTest(test.TestCase, parameterized.TestCase): config.get_soft_device_placement(), context.context().soft_device_placement) - # Since soft placement is enabled, the mod operation should work with CPU + # Since soft placement is enabled, the mod operation should fallback to CPU + # with pure eager execution as well as functions mod() + def_function.function(mod)() config.set_soft_device_placement(False) self.assertEqual(config.get_soft_device_placement(), False) @@ -182,8 +183,11 @@ class ConfigTest(test.TestCase, parameterized.TestCase): context.context().soft_device_placement) # Since soft placement is disabled, the mod operation should fail on GPU + # with pure eager execution as well as functions with self.assertRaises(errors.InvalidArgumentError): mod() + with self.assertRaises(errors.InvalidArgumentError): + def_function.function(mod)() @reset_eager def testLogDevicePlacement(self): @@ -203,12 +207,8 @@ class ConfigTest(test.TestCase, parameterized.TestCase): context.ensure_initialized() - with self.assertRaises(RuntimeError): - context.set_log_device_placement(True) - - # If the setting the device placement is a no-op, do not throw a runtime - # 
exception. - context.set_log_device_placement(False) + # Changing the device placement should not throw an exception + context.set_log_device_placement(True) @reset_eager def testEnableMlirBridge(self): diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index ec54efa61cf..836cafbd494 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -488,6 +488,18 @@ PYBIND11_MODULE(_pywrap_tfe, m) { // NOTE: different from TFE_ContextSyncExecutors that raises potential // errors, deliberately ignore executor statuses in cleanup. }); + m.def("TFE_ContextSetSoftDevicePlacement", [](py::handle& ctx, bool enable) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + TFE_ContextSetSoftDevicePlacement(tensorflow::InputTFE_Context(ctx), enable, + status.get()); + }); + m.def("TFE_ContextSetLogDevicePlacement", [](py::handle& ctx, bool enable) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + TFE_ContextSetLogDevicePlacement(tensorflow::InputTFE_Context(ctx), enable, + status.get()); + }); // TFE_Executor logic m.def( From 843f3da02df95e95593af625091646e3ed49b8d6 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Tue, 12 May 2020 23:26:08 -0700 Subject: [PATCH 091/412] Disable flaky test PiperOrigin-RevId: 311272834 Change-Id: Id8af3ac197f65dde4ae50c9b5ad63d2d328652f6 --- tensorflow/python/ops/parallel_for/control_flow_ops_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 11380b2dac2..01776808525 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -1400,6 +1400,8 @@ class StatelessIfTest(PForTestCase): class IfTest(PForTestCase): def test_read_var(self): + self.skipTest("b/156438918") # Flaky + x = [1, 2, 3, 4, 5.] y = 2.5 z = resource_variable_ops.ResourceVariable(5.)
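A minimal sketch of how the dynamic placement controls introduced in PATCH 090 above can be exercised through the experimental C API. It uses only `TFE_ContextSetSoftDevicePlacement`, `TFE_ContextSetLogDevicePlacement`, and the standard `TF_Status` helpers; creation of the eager context and status checking are assumed.

```c++
#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/c/eager/c_api_experimental.h"

// Sketch: toggle soft device placement and placement logging on an existing
// eager context at runtime, using the APIs declared in c_api_experimental.h.
void EnableDynamicPlacement(TFE_Context* ctx) {
  TF_Status* status = TF_NewStatus();
  TFE_ContextSetSoftDevicePlacement(ctx, /*enable=*/1, status);
  TFE_ContextSetLogDevicePlacement(ctx, /*enable=*/1, status);
  TF_DeleteStatus(status);
}
```

Because the soft placement policy is folded into the kernel cache key (see the `execute.cc` hunk above), toggling it between calls selects a kernel compiled under the new policy rather than reusing one placed under the old policy.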
From a88c46347c20f6e4875f4c1c75ffc5b5bf38edb8 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 15:35:18 +0800 Subject: [PATCH 092/412] change and cleanup per review --- tensorflow/lite/delegates/nnapi/nnapi_delegate.cc | 9 ++++----- tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h | 4 ++-- .../tools/accuracy/ilsvrc/imagenet_model_evaluator.cc | 2 -- .../tools/accuracy/ilsvrc/imagenet_model_evaluator.h | 3 --- .../tools/benchmark/benchmark_performance_options.cc | 3 --- .../lite/tools/delegates/default_execution_provider.cc | 4 ---- .../lite/tools/delegates/nnapi_delegate_provider.cc | 2 +- .../tools/evaluation/evaluation_delegate_provider.cc | 4 ---- .../evaluation/tasks/coco_object_detection/run_eval.cc | 5 ----- .../tasks/imagenet_image_classification/run_eval.cc | 5 ----- .../tools/evaluation/tasks/inference_diff/run_eval.cc | 4 ---- 11 files changed, 7 insertions(+), 38 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 867d03f5227..ff6ad0dc0d9 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -3151,7 +3151,7 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, "creating NNAPI model", nnapi_errno); nn_model_.reset(model); - TF_LITE_ENSURE_STATUS(BuildGraph(context, params->delegate, + TF_LITE_ENSURE_STATUS(BuildGraph(context, delegate_options, params->input_tensors, params->output_tensors, nnapi_errno)); } @@ -3203,7 +3203,6 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, const auto delegate_options = StatefulNnApiDelegate::GetOptions(node->delegate); - ANeuralNetworksCompilation* compilation = nullptr; if (!nnapi_devices_.empty()) { // Compile for the selected accelerator. @@ -3877,7 +3876,8 @@ TfLiteStatus NNAPIDelegateKernel::AddOpsAndTensors(TfLiteContext* context, } TfLiteStatus NNAPIDelegateKernel::BuildGraph( - TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteContext* context, + const StatefulNnApiDelegate::Options& delegate_options, const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno) { // Build the ops and tensors. @@ -3888,7 +3888,6 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( std::vector outputs; outputs.reserve(output_tensors->size); - const auto delegate_options = StatefulNnApiDelegate::GetOptions(delegate); size_t total_input_byte_size = 0; // Make the TensorFlow Lite inputs and outputs to ann_indices. 
for (int i : TfLiteIntArrayView(input_tensors)) { @@ -4025,9 +4024,9 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, delegate_data_.disallow_nnapi_cpu = options.disallow_nnapi_cpu; delegate_data_.max_number_delegated_partitions = options.max_number_delegated_partitions; + delegate_data_.allow_fp16 = options.allow_fp16; TFLITE_LOG_PROD_ONCE(tflite::TFLITE_LOG_INFO, "Created TensorFlow Lite delegate for NNAPI."); - delegate_data_.allow_fp16 = options.allow_fp16; Prepare = DoPrepare; CopyFromBufferHandle = DoCopyFromBufferHandle; CopyToBufferHandle = DoCopyToBufferHandle; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 60151196372..5d0ea63ab4c 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -349,8 +349,8 @@ class NNAPIDelegateKernel { TfLiteStatus AddOpsAndTensors(TfLiteContext* context, int* nnapi_errno); TfLiteStatus BuildGraph(TfLiteContext* context, - TfLiteDelegate* delegate, - const TfLiteIntArray* input_tensors, + const StatefulNnApiDelegate::Options& options, + const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno); }; diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc index 64ce87ae8aa..f318dc68d09 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.cc @@ -141,8 +141,6 @@ class CompositeObserver : public ImagenetModelEvaluator::Observer { tflite::Flag::CreateFlag(kNumRanksFlag, ¶ms.num_ranks, "Generates the top-1 to top-k accuracy values" "where k = num_ranks. Default: 10"), - tflite::Flag::CreateFlag("nnapi_allow_fp16", ¶ms.nnapi_allow_fp16, - "allow fp16 in nnapi"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); diff --git a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h index 3ba22cbc2af..65d4a2c49f8 100644 --- a/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h +++ b/tensorflow/lite/tools/accuracy/ilsvrc/imagenet_model_evaluator.h @@ -78,9 +78,6 @@ class ImagenetModelEvaluator { // Number of interpreter threads. int num_interpreter_threads = 1; - - // allow fp16 - bool nnapi_allow_fp16 = false; }; // An evaluation observer. 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc index c2d9374506e..cfce23c4595 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_performance_options.cc @@ -303,9 +303,6 @@ void BenchmarkPerformanceOptions::CreatePerformanceOptions() { BenchmarkParam::Create(false)); params.AddParam("max_delegated_partitions", BenchmarkParam::Create(0)); - params.AddParam("max_delegated_partitions", - params.AddParam("nnapi_allow_fp16", - BenchmarkParam::Create(false)); all_run_params_.emplace_back(std::move(params)); } } diff --git a/tensorflow/lite/tools/delegates/default_execution_provider.cc b/tensorflow/lite/tools/delegates/default_execution_provider.cc index 67c38308206..f75fd791072 100644 --- a/tensorflow/lite/tools/delegates/default_execution_provider.cc +++ b/tensorflow/lite/tools/delegates/default_execution_provider.cc @@ -30,7 +30,6 @@ class DefaultExecutionProvider : public DelegateProvider { ToolParam::Create(0)); default_params_.AddParam("min_nodes_per_partition", ToolParam::Create(0)); - default_params_.AddParam("allow_fp16", ToolParam::Create(false)); } std::vector CreateFlags(ToolParams* params) const final; @@ -45,7 +44,6 @@ std::vector DefaultExecutionProvider::CreateFlags( std::vector flags = { CreateFlag("num_threads", params, "number of threads used for inference on CPU."), - CreateFlag("allow_fp16", params, "allow_fp16"), CreateFlag("max_delegated_partitions", params, "Max number of partitions to be delegated."), CreateFlag( @@ -63,8 +61,6 @@ void DefaultExecutionProvider::LogParams(const ToolParams& params) const { << params.Get("max_delegated_partitions") << "]"; TFLITE_LOG(INFO) << "Min nodes per partition : [" << params.Get("min_nodes_per_partition") << "]"; - TFLITE_LOG(INFO) << "allow_fp16: [" - << params.Get("allow_fp16") << "]"; } TfLiteDelegatePtr DefaultExecutionProvider::CreateTfLiteDelegate( diff --git a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc index 6492ba82849..2fbfb791e8c 100644 --- a/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/nnapi_delegate_provider.cc @@ -88,7 +88,7 @@ void NnapiDelegateProvider::LogParams(const ToolParams& params) const { << params.Get("disable_nnapi_cpu") << "]"; } if (params.Get("nnapi_allow_fp16")) { - TFLITE_LOG(INFO) << "nnapi_allow_fp16: [" + TFLITE_LOG(INFO) << "Allow fp16 in NNAPI: [" << params.Get("nnapi_allow_fp16") << "]"; } } diff --git a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc index ea07378a8fa..42f2666ba9b 100644 --- a/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc +++ b/tensorflow/lite/tools/evaluation/evaluation_delegate_provider.cc @@ -132,10 +132,6 @@ tools::ToolParams DelegateProviders::GetAllParams( tool_params.Set("num_threads", params.num_threads()); } - if (params.has_nnapi_allow_fp16()) { - tool_params.Set("nnapi_allow_fp16", params.nnapi_allow_fp16()); - } - const auto type = params.delegate(); switch (type) { case TfliteInferenceParams::NNAPI: diff --git a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc index de1ae6e2e94..765e8fc6465 100644 --- a/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc 
+++ b/tensorflow/lite/tools/evaluation/tasks/coco_object_detection/run_eval.cc @@ -65,7 +65,6 @@ class CocoObjectDetection : public TaskExecutor { bool debug_mode_; std::string delegate_; int num_interpreter_threads_; - bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -105,9 +104,6 @@ CocoObjectDetection::CocoObjectDetection(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'xnnpack', 'hexagon'}"), - tflite::Flag::CreateFlag( - "nnapi_allow_fp16", &allow_fp16_, - "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); DelegateProviders delegate_providers; @@ -136,7 +132,6 @@ absl::optional CocoObjectDetection::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_nnapi_allow_fp16(allow_fp16_); // Get ground truth data. absl::flat_hash_map ground_truth_map; diff --git a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc index 8a7fd864c6e..13eeb313ad4 100644 --- a/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/imagenet_image_classification/run_eval.cc @@ -67,7 +67,6 @@ class ImagenetClassification : public TaskExecutor { std::string delegate_; int num_images_; int num_interpreter_threads_; - bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -107,9 +106,6 @@ ImagenetClassification::ImagenetClassification(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), - tflite::Flag::CreateFlag( - "nnapi_allow_fp16", &allow_fp16_, - "nnapi allow fp16"), }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -159,7 +155,6 @@ absl::optional ImagenetClassification::Run() { inference_params->set_model_file_path(model_file_path_); inference_params->set_num_threads(num_interpreter_threads_); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_nnapi_allow_fp16(allow_fp16_); classification_params->mutable_topk_accuracy_eval_params()->set_k(10); ImageClassificationStage eval(eval_config); diff --git a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc index c85d997974b..814ebe3b3bf 100644 --- a/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc +++ b/tensorflow/lite/tools/evaluation/tasks/inference_diff/run_eval.cc @@ -50,7 +50,6 @@ class InferenceDiff : public TaskExecutor { std::string delegate_; int num_runs_; int num_interpreter_threads_; - bool allow_fp16_; DelegateProviders delegate_providers_; }; @@ -72,8 +71,6 @@ InferenceDiff::InferenceDiff(int* argc, char* argv[]) kDelegateFlag, &delegate_, "Delegate to use for test inference, if available. " "Must be one of {'nnapi', 'gpu', 'hexagon', 'xnnpack'}"), - tflite::Flag::CreateFlag("nnapi_allow_fp16", &allow_fp16_, - "nnapi allow fp16") }; tflite::Flags::Parse(argc, const_cast(argv), flag_list); delegate_providers_.InitFromCmdlineArgs(argc, const_cast(argv)); @@ -91,7 +88,6 @@ absl::optional InferenceDiff::Run() { // generating random data. 
inference_params->set_invocations_per_run(3); inference_params->set_delegate(ParseStringToDelegateType(delegate_)); - inference_params->set_nnapi_allow_fp16(allow_fp16_); if (!delegate_.empty() && inference_params->delegate() == TfliteInferenceParams::NONE) { TFLITE_LOG(WARN) << "Unsupported TFLite delegate: " << delegate_; From 9083aa48e7634edcbc41d63804e5df662e6a8c4b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 01:55:49 -0700 Subject: [PATCH 093/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311289097 Change-Id: Ic47747fe7d0fd7269c0203be9b1009e400b4b297 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..53aa48bd33c 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 5eb1be50238dca9a5b92757391b4750b3529aae3 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 13 May 2020 02:01:33 -0700 Subject: [PATCH 094/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/58bc507b6fe6 PiperOrigin-RevId: 311289597 Change-Id: I1471895afdb961a19df531bc566898e486162d96 --- tensorflow/compiler/mlir/lite/BUILD | 2 +- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 2 +- tensorflow/compiler/mlir/tensorflow/BUILD | 4 ++-- .../compiler/mlir/tensorflow/ir/tf_op_base.td | 2 +- tensorflow/compiler/mlir/tfjs/BUILD | 2 +- tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td | 2 +- tensorflow/compiler/mlir/tfrt/BUILD | 2 +- .../runtime_fallback/runtime_fallback_ops.td | 2 +- tensorflow/compiler/mlir/xla/BUILD | 4 ++-- tensorflow/compiler/mlir/xla/ir/chlo_ops.td | 2 +- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- tensorflow/compiler/mlir/xla/ir/lhlo_ops.td | 2 +- third_party/mlir/BUILD | 24 +++++++++---------- third_party/mlir/test.BUILD | 2 +- 14 files changed, 27 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index f99b2806faf..9b5b0c209e5 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -31,7 +31,7 @@ filegroup( "//tensorflow/compiler/mlir/lite/quantization:quantization_td_files", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 13b8ae83e34..fdf1501dbef 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -20,7 +20,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/LoopLikeInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/lite/ir/tfl_op_interfaces.td" include "tensorflow/compiler/mlir/lite/quantization/quantization.td" diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 0edf0f33a23..54b560ed6ce 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -36,7 +36,7 @@ filegroup( "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -1075,7 +1075,7 @@ genrule( srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", "ir/tf_generated_ops.td", "ir/tf_op_base.td", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td index cd20cc79c17..dbd8ab0fae2 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td @@ -23,7 +23,7 @@ limitations under the License. 
#define TF_OP_BASE include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td" //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tfjs/BUILD b/tensorflow/compiler/mlir/tfjs/BUILD index 806a77e9c38..ac629ac4573 100644 --- a/tensorflow/compiler/mlir/tfjs/BUILD +++ b/tensorflow/compiler/mlir/tfjs/BUILD @@ -40,7 +40,7 @@ gentbl( "ir/tfjs_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/LoopLikeInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td index 172347bc0f5..134aa010d8c 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.td @@ -23,7 +23,7 @@ limitations under the License. #define TFJS_DIALECT include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // TensorFlow.js dialect definitions diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 78787245bd6..edcfc574452 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -40,7 +40,7 @@ filegroup( srcs = [ "runtime_fallback/runtime_fallback_ops.td", "@llvm-project//mlir:OpBaseTdFiles", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@tf_runtime//:OpBaseTdFiles", ], ) diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td index aeed800a1c3..c33c6f8d73d 100644 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td +++ b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td @@ -20,7 +20,7 @@ limitations under the License. 
#define TFRT_DELEGATE_DIALECT include "tfrt/tfrt_op_base.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" //===----------------------------------------------------------------------===// // Type definitions diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index d9108e8f3bc..590595a668f 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -38,7 +38,7 @@ filegroup( "ir/lhlo_ops.td", "@llvm-project//mlir:OpBaseTdFiles", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -822,7 +822,7 @@ genrule( name = "operator_writer_inc", srcs = [ "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", "@llvm-project//mlir:include/mlir/IR/OpBase.td", ":ir/hlo_ops.td", ":ir/hlo_ops_base.td", diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.td b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td index a244985c9b5..f9672c1a95a 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.td @@ -31,7 +31,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def HLOClient_Dialect : Dialect { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 0db9563a4c1..f78ac7624d2 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -23,7 +23,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Interfaces/InferTypeOpInterface.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" include "tensorflow/compiler/mlir/xla/ir/hlo_utils.td" diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td index d7e838a6f2b..db75bbd1f67 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.td @@ -19,7 +19,7 @@ limitations under the License. 
#define LHLO_OPS include "mlir/IR/OpBase.td" -include "mlir/Interfaces/SideEffects.td" +include "mlir/Interfaces/SideEffectInterfaces.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td" def LHLO_Dialect : Dialect { diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 75b32c73260..ce5468fe679 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -178,7 +178,7 @@ filegroup( "include/mlir/Dialect/Affine/IR/AffineOps.td", "include/mlir/Dialect/Affine/IR/AffineOpsBase.td", "include/mlir/Interfaces/LoopLikeInterface.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -217,7 +217,7 @@ filegroup( "include/mlir/Dialect/AVX512/AVX512.td", "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/IR/OpBase.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ], ) @@ -302,7 +302,7 @@ filegroup( "include/mlir/Dialect/SCF/SCFOps.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", "include/mlir/Interfaces/LoopLikeInterface.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -374,7 +374,7 @@ filegroup( "include/mlir/IR/OpAsmInterface.td", "include/mlir/Interfaces/CallInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", "include/mlir/Interfaces/ViewLikeInterface.td", ":OpBaseTdFiles", ], @@ -997,7 +997,7 @@ filegroup( "include/mlir/Dialect/GPU/GPUOps.td", "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/IR/SymbolInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1130,7 +1130,7 @@ filegroup( "include/mlir/Dialect/LLVMIR/LLVMOps.td", "include/mlir/IR/SymbolInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1417,7 +1417,7 @@ filegroup( srcs = [ "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/Dialect/LLVMIR/NVVMOps.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1489,7 +1489,7 @@ filegroup( srcs = [ "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/Dialect/LLVMIR/ROCDLOps.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -1541,7 +1541,7 @@ filegroup( "include/mlir/IR/SymbolInterfaces.td", "include/mlir/Interfaces/CallInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ] + glob(["include/mlir/Dialect/SPIRV/*.td"]), ) @@ -2244,7 +2244,7 @@ gentbl( ), ], tblgen = ":mlir-tblgen", - td_file = "include/mlir/Interfaces/SideEffects.td", + td_file = "include/mlir/Interfaces/SideEffectInterfaces.td", td_srcs = [ ":OpBaseTdFiles", ], @@ -2910,7 +2910,7 @@ filegroup( srcs = [ "include/mlir/Dialect/Quant/QuantOps.td", "include/mlir/Dialect/Quant/QuantOpsBase.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", ":OpBaseTdFiles", ], ) @@ -3390,7 +3390,7 @@ exports_files( "include/mlir/Interfaces/CallInterfaces.td", "include/mlir/Interfaces/ControlFlowInterfaces.h", 
"include/mlir/Interfaces/ControlFlowInterfaces.td", - "include/mlir/Interfaces/SideEffects.td", + "include/mlir/Interfaces/SideEffectInterfaces.td", "include/mlir/Interfaces/ViewLikeInterface.td", "include/mlir/Dialect/LLVMIR/LLVMOpBase.td", "include/mlir/Dialect/StandardOps/IR/Ops.td", diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index a0312a54b68..c19d312d082 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -77,7 +77,7 @@ gentbl( "@llvm-project//mlir:include/mlir/Interfaces/CallInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/ControlFlowInterfaces.td", "@llvm-project//mlir:include/mlir/Interfaces/InferTypeOpInterface.td", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffects.td", + "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", ], test = True, ) From 9c8ca4905e335120e8fb19ea316674416ed1a27e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 02:02:49 -0700 Subject: [PATCH 095/412] Update GraphDef version to 400. PiperOrigin-RevId: 311289755 Change-Id: Ibae7d2dcd3f4b697e7f2735183c62d4669ead6ba --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 23e6138d553..68df6a1b632 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 399 // Updated: 2020/5/12 +#define TF_GRAPH_DEF_VERSION 400 // Updated: 2020/5/13 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 101d46ab716931f27c76b86c2f4d1e5780b43e64 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 02:02:55 -0700 Subject: [PATCH 096/412] compat: Update forward compatibility horizon to 2020-05-13 PiperOrigin-RevId: 311289765 Change-Id: I6167b9a3d737248f831fbd4405339a9e59220944 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 627979a5cb1..26d291877cb 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 12) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 13) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From ab67ad7c4490c268abd7d46f457fbe1c425fe070 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 13 May 2020 02:44:55 -0700 Subject: [PATCH 097/412] Bump open source llvm revision to 1c44430e738ba83eefe6d56a245ee30649d8988d PiperOrigin-RevId: 311293944 Change-Id: I97e99d957847f7e7664549795c1a3fd30fedd987 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index fe548fdec05..83e74f3d105 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -162,8 +162,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): print("path_prefix was specified to tf_workspace but is no longer used " + "and will be removed in the future.") - TFRT_COMMIT = "341ba0448c117af4e29ae3911141265ee8e57860" - TFRT_SHA256 = "27716458f8ca7d91fc2d0f681127dbdd478eea78d6da5153c51b4696ebd14d55" + TFRT_COMMIT = "26fb26d716545388edb9785f8f4b3e60a4ad5092" + TFRT_SHA256 = "f7419a3eaab8b7137a4de5b428045a731d93da91ef1bce9ba91fab81ed23a676" TFRT_URLS = [ "http://mirror.tensorflow.org/github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), "https://github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "123bee602a260150ff55c74287f583a67ee78f36" - LLVM_SHA256 = "313ec75e47ea3f128724a61b8b6b45b7d305ba2ae57a5084b4bf1f881b4ec8f2" + LLVM_COMMIT = "1c44430e738ba83eefe6d56a245ee30649d8988d" + LLVM_SHA256 = "81ad47eaf74dfaea1befbe7b41facfd9bcee5ca3d5635325584dbabf4bf1fa5e" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From eed4bb5cc10125abc6d175050062372dce34bfd2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 03:47:08 -0700 Subject: [PATCH 098/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311299753 Change-Id: Ia881aec05fa7e6a9a5f0a559c79bf3ab5fa954a3 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 53aa48bd33c..a90fc2e3e26 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25651,7 +25651,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25714,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25965,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26449,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45537,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47477,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47548,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48537,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From c117a875a220dd9e097027f308566e6a9398bc18 Mon Sep 17 00:00:00 2001 From: Koan-Sin Tan Date: Wed, 13 May 2020 20:26:04 +0800 Subject: [PATCH 099/412] fix bad indent and remove leftover --- tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h | 2 +- tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 5d0ea63ab4c..668fdf5b5f6 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -350,7 +350,7 @@ class NNAPIDelegateKernel { TfLiteStatus BuildGraph(TfLiteContext* context, const StatefulNnApiDelegate::Options& options, - const TfLiteIntArray* input_tensors, + const TfLiteIntArray* input_tensors, const TfLiteIntArray* output_tensors, int* nnapi_errno); }; diff --git a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto index cecdb22c637..09765d71726 100644 --- a/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto +++ b/tensorflow/lite/tools/evaluation/proto/evaluation_stages.proto @@ -121,9 +121,6 @@ message TfliteInferenceParams { // This helps benchmark cases where extensive pre-processing might not be // required for every input. optional int32 invocations_per_run = 4 [default = 1]; - - // nnapi_allow_fp16 - optional bool nnapi_allow_fp16 = 5 [default = false]; } // Metrics specific to TFLite inference. From dc4c6d305ba3d2de4a795ec77b483b0fa695b9ee Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Wed, 13 May 2020 06:03:54 -0700 Subject: [PATCH 100/412] Change the default value of 'use_coreml' parameter to false PiperOrigin-RevId: 311313238 Change-Id: Id85bd1ad4b86cafbddeba924714256587a7da732 --- tensorflow/lite/tools/delegates/coreml_delegate_provider.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc index c29555716a4..c6509618aee 100644 --- a/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc +++ b/tensorflow/lite/tools/delegates/coreml_delegate_provider.cc @@ -31,7 +31,7 @@ class CoreMlDelegateProvider : public DelegateProvider { public: CoreMlDelegateProvider() { #if defined(REAL_IPHONE_DEVICE) - default_params_.AddParam("use_coreml", ToolParam::Create(true)); + default_params_.AddParam("use_coreml", ToolParam::Create(false)); default_params_.AddParam("coreml_version", ToolParam::Create(0)); #endif } From 5530521a577d7b939391d4c1bf4672b26e7abac4 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 13 May 2020 06:50:32 -0700 Subject: [PATCH 101/412] Qualify uses of std::string PiperOrigin-RevId: 311319203 Change-Id: Ia312681455cb0518879cf323518914f49ea88b33 --- .../core/grappler/inputs/trivial_test_graph_input_yielder.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h index 74e5080a30f..bf776bcd2bc 100644 --- a/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h +++ b/tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h @@ -30,7 +30,7 @@ class TrivialTestGraphInputYielder : public InputYielder { public: TrivialTestGraphInputYielder(int num_stages, int width, int tensor_size, bool insert_queue, - const std::vector& device_names); + const std::vector& device_names); bool NextItem(GrapplerItem* item) override; private: @@ -38,7 +38,7 @@ class TrivialTestGraphInputYielder : public InputYielder { const int width_; const int tensor_size_; const bool insert_queue_; - std::vector device_names_; + std::vector device_names_; }; } // end namespace grappler From e4702e19bb1ef0d5fc4e63833fcc88e533371f96 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 13 May 2020 06:58:18 -0700 Subject: [PATCH 102/412] Add SSE4 path for Tanh and Logistic. PiperOrigin-RevId: 311320167 Change-Id: Ie62fd09adf8e41827796d2102c5f1d505429a139 --- .../internal/optimized/optimized_ops.h | 101 ++++++++++++++++++ tensorflow/workspace.bzl | 8 +- 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 5f183de7269..a6d37f4f1ed 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -4332,6 +4332,41 @@ inline void Logistic(const LogisticParams& params, } } #endif +#ifdef GEMMLOWP_SSE4 + { + // F0 uses 0 integer bits, range [-1, 1]. + // This is the return type of math functions such as tanh, logistic, + // whose range is in [-1, 1]. + using F0 = gemmlowp::FixedPoint; + // F3 uses 3 integer bits, range [-8, 8], the input range expected here. + using F3 = gemmlowp::FixedPoint; + + for (; c <= flat_size - 16; c += 16) { + F3 input0 = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F3 input1 = F3::FromRaw(gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr + 8)))); + F0 output0 = gemmlowp::logistic(input0); + F0 output1 = gemmlowp::logistic(input1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output0.raw().v); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8), + output1.raw().v); + input_data_ptr += 16; + output_data_ptr += 16; + } + for (; c <= flat_size - 8; c += 8) { + F3 input = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F0 output = gemmlowp::logistic(input); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output.raw().v); + input_data_ptr += 8; + output_data_ptr += 8; + } + } +#endif + { // F0 uses 0 integer bits, range [-1, 1]. // This is the return type of math functions such as tanh, logistic, @@ -4438,6 +4473,72 @@ inline void Tanh(const TanhParams& params, const RuntimeShape& input_shape, } } #endif +#ifdef GEMMLOWP_SSE4 + { + // F0 uses 0 integer bits, range [-1, 1]. 
+ // This is the return type of math functions such as tanh, logistic, + // whose range is in [-1, 1]. + using F0 = gemmlowp::FixedPoint; + // F3 uses 3 integer bits, range [-8, 8], the input range expected here. + using F3 = gemmlowp::FixedPoint; + + if (input_left_shift == 0) { + for (; c <= flat_size - 16; c += 16) { + F3 input0 = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F3 input1 = F3::FromRaw(gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr + 8)))); + F0 output0 = gemmlowp::tanh(input0); + F0 output1 = gemmlowp::tanh(input1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output0.raw().v); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8), + output1.raw().v); + + input_data_ptr += 16; + output_data_ptr += 16; + } + for (; c <= flat_size - 8; c += 8) { + F3 input = F3::FromRaw(gemmlowp::to_int16x8_m128i( + _mm_loadu_si128(reinterpret_cast(input_data_ptr)))); + F0 output = gemmlowp::tanh(input); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output.raw().v); + input_data_ptr += 8; + output_data_ptr += 8; + } + } else { + for (; c <= flat_size - 16; c += 16) { + F3 input0 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>( + gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr))))); + F3 input1 = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>( + gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr + 8))))); + F0 output0 = gemmlowp::tanh(input0); + F0 output1 = gemmlowp::tanh(input1); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output0.raw().v); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr + 8), + output1.raw().v); + + input_data_ptr += 16; + output_data_ptr += 16; + } + for (; c <= flat_size - 8; c += 8) { + F3 input = F3::FromRaw(gemmlowp::SaturatingRoundingMultiplyByPOT<1>( + gemmlowp::to_int16x8_m128i(_mm_loadu_si128( + reinterpret_cast(input_data_ptr))))); + F0 output = gemmlowp::tanh(input); + _mm_storeu_si128(reinterpret_cast<__m128i*>(output_data_ptr), + output.raw().v); + input_data_ptr += 8; + output_data_ptr += 8; + } + } + } +#endif + { // F0 uses 0 integer bits, range [-1, 1]. 
// This is the return type of math functions such as tanh, logistic, diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 83e74f3d105..31389d7c459 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -354,11 +354,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "gemmlowp", - sha256 = "6678b484d929f2d0d3229d8ac4e3b815a950c86bb9f17851471d143f6d4f7834", # SHARED_GEMMLOWP_SHA - strip_prefix = "gemmlowp-12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3", + sha256 = "43146e6f56cb5218a8caaab6b5d1601a083f1f31c06ff474a4378a7d35be9cfb", # SHARED_GEMMLOWP_SHA + strip_prefix = "gemmlowp-fda83bdc38b118cc6b56753bd540caa49e570745", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip", - "https://github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip", + "https://github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip", ], ) From 0e7612ea0d2ee8ddd65e7bf3e96800911384976e Mon Sep 17 00:00:00 2001 From: Michael Moffitt Date: Wed, 13 May 2020 09:21:30 -0500 Subject: [PATCH 103/412] Removes duplicate space from retracing warning message --- tensorflow/python/eager/def_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index 48c9b06fa38..c61f39111b1 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -109,7 +109,7 @@ class _FrequentTracingDetector(object): "retracing. Tracing is expensive and the excessive number of " "tracings could be due to (1) creating @tf.function repeatedly in " "a loop, (2) passing tensors with different shapes, (3) passing " - "Python objects instead of tensors. For (1), please define your " + "Python objects instead of tensors. For (1), please define your " "@tf.function outside of the loop. For (2), @tf.function has " "experimental_relax_shapes=True option that relaxes argument " "shapes that can avoid unnecessary retracing. For (3), please " From f8429e72fc992f6b9b353e8db2ae846a1c69d7b8 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 13 May 2020 07:18:43 -0700 Subject: [PATCH 104/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/897d8ee5cd69 PiperOrigin-RevId: 311323011 Change-Id: I0d60709d46dffa171e299a7e8bdfc9a1ae43fc06 --- .../tests/chlo_infer_shape_type_methods.mlir | 4 ++-- .../chlo_legalize_to_hlo_broadcasts.mlir | 12 +++++----- .../xla/tests/legalize-tf-BatchMatMulV2.mlir | 8 +++---- third_party/mlir/BUILD | 24 ------------------- third_party/mlir/test.BUILD | 2 +- 5 files changed, 13 insertions(+), 37 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir index ce0243e416c..d67a7d09f7c 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_infer_shape_type_methods.mlir @@ -6,8 +6,8 @@ // CHECK-SAME: %[[ARG0:.+]]: tensor, // CHECK-SAME: %[[ARG1:.+]]: tensor func @broadcast_add(%arg0: tensor, %arg1: tensor) -> tensor<1xindex> { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[BCAST_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[EXTENTS:.+]] = "shape.to_extent_tensor"(%[[BCAST_S]]) // CHECK: return %[[EXTENTS]] diff --git a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir index 2bc1e0c6852..7194f7034b5 100644 --- a/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/chlo_legalize_to_hlo_broadcasts.mlir @@ -14,8 +14,8 @@ func @addWithoutBroadcast(%arg0: tensor<4xf32>, %arg1: tensor<4xf32>) -> tensor< // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} @@ -31,8 +31,8 @@ func @dynamicBroadcast(%arg0: tensor, %arg1: tensor) -> tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> tensor> { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor @@ -48,8 +48,8 @@ func @dynamicBroadcastComplex(%arg0: tensor, %arg1: tensor) -> t // CHECK-SAME: %[[ARG0:.+]]: tensor // CHECK-SAME: %[[ARG1:.+]]: tensor func @dynamicBroadcastCompare(%arg0: 
tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: %[[ARG0_S:.+]] = "shape.shape_of"(%[[ARG0]]) - // CHECK-DAG: %[[ARG1_S:.+]] = "shape.shape_of"(%[[ARG1]]) + // CHECK-DAG: %[[ARG0_S:.+]] = shape.shape_of %[[ARG0]] + // CHECK-DAG: %[[ARG1_S:.+]] = shape.shape_of %[[ARG1]] // CHECK-DAG: %[[RESULT_S:.+]] = "shape.broadcast"(%[[ARG0_S]], %[[ARG1_S]]) // CHECK: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_S]]) // CHECK-DAG: %[[ARG0_B:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[ARG0]], %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir index 08df9fd3808..3605e2a0d5c 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-BatchMatMulV2.mlir @@ -7,8 +7,8 @@ func @batchmatmulv2_basic(%arg0: tensor<1x4x2xf32>, %arg1: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> { // CHECK-LABEL: func @batchmatmulv2_basic // CHECK-SAME: ([[LHS:%.*]]: tensor<1x4x2xf32>, [[RHS:%.*]]: tensor<3x2x4xf32>) -> tensor<3x4x4xf32> -// CHECK: [[LHSSHAPE:%.*]] = "shape.shape_of"([[LHS]]) : (tensor<1x4x2xf32>) -> !shape.shape -// CHECK: [[RHSSHAPE:%.*]] = "shape.shape_of"([[RHS]]) : (tensor<3x2x4xf32>) -> !shape.shape +// CHECK: [[LHSSHAPE:%.*]] = shape.shape_of [[LHS]] : tensor<1x4x2xf32> +// CHECK: [[RHSSHAPE:%.*]] = shape.shape_of [[RHS]] : tensor<3x2x4xf32> // CHECK: [[CM2:%.*]] = constant -2 : i32 // CHECK: [[LHSHEAD:%.*]], [[LHSTAIL:%.*]] = "shape.split_at"([[LHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) // CHECK: [[RHSHEAD:%.*]], [[RHSTAIL:%.*]] = "shape.split_at"([[RHSSHAPE]], [[CM2]]) : (!shape.shape, i32) -> (!shape.shape, !shape.shape) @@ -86,8 +86,8 @@ func @batchmatmulv2_adj_complex(%arg0: tensor<5x2xcomplex>, %arg1: tensor<2 // CHECK: [[RHSIM:%.*]] = "xla_hlo.imag"([[RHS]]) // CHECK: [[RHSIMNEG:%.*]] = "xla_hlo.negate"([[RHSIM]]) // CHECK: [[RHSCONJ:%.*]] = "xla_hlo.complex"([[RHSRE]], [[RHSIMNEG]]) -// CHECK: "shape.shape_of"([[LHSCONJ]]) -// CHECK: "shape.shape_of"([[RHSCONJ]]) +// CHECK: shape.shape_of [[LHSCONJ]] +// CHECK: shape.shape_of [[RHSCONJ]] %0 = "tf.BatchMatMulV2"(%arg0, %arg1) {adj_x = true, adj_y = true, device = ""} : (tensor<5x2xcomplex>, tensor<2x4xcomplex>) -> tensor<5x4xcomplex> return %0 : tensor<5x4xcomplex> } diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index ce5468fe679..8b61ce98dab 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1801,28 +1801,6 @@ cc_library( ], ) -cc_library( - name = "StandardToStandard", - srcs = glob([ - "lib/Conversion/StandardToStandard/*.cpp", - "lib/Conversion/StandardToStandard/*.h", - ]), - hdrs = glob([ - "include/mlir/Conversion/StandardToStandard/*.h", - ]), - includes = [ - "include", - "lib/Conversion/StandardToStandard", - ], - deps = [ - ":ConversionPassIncGen", - ":IR", - ":Pass", - ":StandardOps", - ":Transforms", - ], -) - cc_library( name = "SPIRVSerialization", srcs = glob( @@ -2485,7 +2463,6 @@ cc_library( ":SCFTransforms", ":StandardOpsTransforms", ":StandardToSPIRVConversions", - ":StandardToStandard", ":Support", ":Transforms", ":VectorToLLVM", @@ -2584,7 +2561,6 @@ cc_library( ":StandardOpsTransforms", ":StandardOpsTransformsPassIncGen", ":StandardToSPIRVConversions", - ":StandardToStandard", ":Transforms", ":TransformsPassIncGen", ":VectorOps", diff --git a/third_party/mlir/test.BUILD 
b/third_party/mlir/test.BUILD index c19d312d082..eb5d8a650eb 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -106,7 +106,7 @@ cc_library( "@llvm-project//mlir:Pass", "@llvm-project//mlir:SideEffects", "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:StandardToStandard", + "@llvm-project//mlir:StandardOpsTransforms", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", ], From cac1acba3f47ace5027dde0f45df15ff508f2d7a Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 13 May 2020 07:52:00 -0700 Subject: [PATCH 105/412] Bump open source llvm revision to 897d8ee5cd693e17f95a7e84194bca4c089a520b PiperOrigin-RevId: 311327327 Change-Id: Ib247eab7624ca88b999ccd871f0b1fb0f824ef1f --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 31389d7c459..7cc156a2985 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -679,8 +679,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "1c44430e738ba83eefe6d56a245ee30649d8988d" - LLVM_SHA256 = "81ad47eaf74dfaea1befbe7b41facfd9bcee5ca3d5635325584dbabf4bf1fa5e" + LLVM_COMMIT = "897d8ee5cd693e17f95a7e84194bca4c089a520b" + LLVM_SHA256 = "994677daedf23bc93ce04f1a527c07c09b7fbbd0986d867b60bd6710057a40de" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From d11a1769c509b303e814ddbfcf3d60a07e993440 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 08:16:59 -0700 Subject: [PATCH 106/412] Separate out creating arguments from PromoteResourcesToArguments in PromoteResourcesToArgsPass (NFC). This simplifies some logic where there is special handling depending on if the resource was originally an argument already present or a VarHandleOp. PiperOrigin-RevId: 311331121 Change-Id: I603f007c28558e3604c62fb991ac82ca560e143e --- .../tests/promote_resources_to_args.mlir | 76 ++-- .../transforms/promote_resources_to_args.cc | 357 ++++++++++-------- 2 files changed, 251 insertions(+), 182 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index e7f4873594b..eb6d40d20d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -1,11 +1,11 @@ // RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-promote-resources-to-args | FileCheck %s -dump-input-on-failure // One resource, one read. The initial value of the resource is read. 
-// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD]]) // CHECK: return %[[PACK]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -19,8 +19,8 @@ func @main() -> tensor<2xf32> { // ----- // One resource, one write. The initial value of the resource is not read. -// CHECK-LABEL: func @main() -> (tensor {tf.resource_name = "x"}) -func @main() { +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.AssignVariableOp" // CHECK: return %[[CONST]] @@ -33,12 +33,12 @@ func @main() { // ----- // One resource, two reads using different resource handles. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg0) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -56,12 +56,12 @@ func @main() -> tensor<2xf32> { // ----- // Two resources, two reads using different resources. -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}, %arg1: tensor {tf.resource_name = "y"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}, %arg2: tensor {tf.resource_name = "y"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.VarHandleOp" // CHECK-NOT: "tf.ReadVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %[[CONST:[0-9]*]]) - // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg1) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %[[CONST:[0-9]*]]) + // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %arg2) // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%[[CONST]], %[[ADD2]]) // CHECK: return %[[PACK]] @@ -79,12 +79,12 @@ func @main() -> tensor<2xf32> { // ----- // One resource with read and write. The initial value of the resource is read. 
-// CHECK-LABEL: func @main(%arg0: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.aliasing_output = 1 : i64, tf.resource_name = "x"}) -> (tensor<2xf32>, tensor) +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" - // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg0, %{{[0-9]*}}) + // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%arg1, %{{[0-9]*}}) // CHECK: %[[ADD2:[0-9]*]] = "tf.AddV2"(%[[ADD1]], %[[ADD1]]) - // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg0, %[[ADD2]]) + // CHECK: %[[PACK:[0-9]*]] = "tf.Pack"(%arg1, %[[ADD2]]) // CHECK: return %[[PACK]], %[[ADD1]] %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor @@ -102,8 +102,8 @@ func @main() -> tensor<2xf32> { // ----- // One resource with read and write. The initial value of the resource is not read. -// CHECK-LABEL: func @main() -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) -func @main() -> tensor<2xf32> { +// CHECK-LABEL: func @main(%arg0: tensor) -> (tensor<2xf32>, tensor {tf.resource_name = "x"}) +func @main(%arg0: tensor) -> tensor<2xf32> { // CHECK-NOT: "tf.AssignVariableOp" // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} // CHECK: %[[ADD1:[0-9]*]] = "tf.AddV2"(%[[CONST]], %[[CONST]]) @@ -138,8 +138,8 @@ func @cond_true(%arg0: tensor>>, %arg1: tensor) -> return %2 : tensor } -// CHECK-LABEL: func @main(%arg0: tensor {tf.resource_name = "x"}) -> tensor<2xf32> -func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { +// CHECK-LABEL: func @main(%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) -> tensor<2xf32> +func @main(%arg0: tensor) -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outputs = "result"}} { %0 = "tf.Const"() {value = dense<1.050000e+03> : tensor} : () -> tensor %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %2 = "tf.ReadVariableOp"(%1) : (tensor>>) -> tensor @@ -157,10 +157,11 @@ func @main() -> tensor<2xf32> attributes {tf.entry_function = {inputs = "", outp // Tests resource passed in as an argument is not modified and not returned. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor -func @main(%arg0: tensor>>) { - %0 = "tf.ReadVariableOp"(%arg0) : (tensor>>) -> tensor - // CHECK-NEXT: "tf.AddV2"(%[[ARG_0]], %[[ARG_0]]) +// CHECK-SAME: %arg0: tensor +// CHECK-SAME: %[[ARG_1:[a-z0-9]+]]: tensor +func @main(%arg0: tensor, %arg1: tensor>>) { + %0 = "tf.ReadVariableOp"(%arg1) : (tensor>>) -> tensor + // CHECK-NEXT: "tf.AddV2"(%[[ARG_1]], %[[ARG_1]]) %1 = "tf.AddV2"(%0, %0) : (tensor, tensor) -> tensor // CHECK-NEXT: return return @@ -171,9 +172,10 @@ func @main(%arg0: tensor>>) { // Tests resource passed in as an argument is modified but not returned. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { // CHECK-NEXT: %[[CONST:[a-z0-9]+]] = "tf.Const" %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -186,9 +188,10 @@ func @main(%arg0: tensor>>) { // Tests last resource assign is returned as a result. 
// CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 0 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> tensor -func @main(%arg0: tensor>>) { +func @main(%arg0: tensor>>, %arg1: tensor) { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -204,9 +207,10 @@ func @main(%arg0: tensor>>) { // returns the same value prior. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () // CHECK: %[[CONST:[a-z0-9]+]] = "tf.Const"() {value = dense<1.050000e+03> : tensor} @@ -221,9 +225,10 @@ func @main(%arg0: tensor>>) -> tensor { // Tests read interleaved between writes. // CHECK-LABEL: func @main -// CHECK-SAME: %[[ARG_0:[a-z0-9]+]]: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %{{[a-z0-9]+}}: tensor {tf.aliasing_output = 1 : i64} +// CHECK-SAME: %arg1: tensor // CHECK-SAME: -> (tensor, tensor) -func @main(%arg0: tensor>>) -> tensor { +func @main(%arg0: tensor>>, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[CONST_0:[a-z0-9]+]] = "tf.Const"() {value = dense<4.200000e+01> : tensor} %0 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor "tf.AssignVariableOp"(%arg0, %0) : (tensor>>, tensor) -> () @@ -312,9 +317,10 @@ func @main() { // Tests resource argument has users that are not ReadVariableOp or // AssignVariableOp. -// expected-error@+1 {{expects users of resource argument 0 to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp'}} +// expected-error@+1 {{expects users of resource argument 0 to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [tf.UnknownOp, tf.VarIsInitializedOp]}} func @main(%arg0: tensor>>) -> tensor { %0 = "tf.VarIsInitializedOp"(%arg0) : (tensor>>) -> tensor + %1 = "tf.UnknownOp"(%arg0) : (tensor>>) -> tensor return %0 : tensor } @@ -323,7 +329,7 @@ func @main(%arg0: tensor>>) -> tensor { // Tests VarHandleOp has users that are not removed. func @main() -> tensor { - // expected-error@+1 {{expects no uses but used by operations: tf.UnknownOp, tf.VarIsInitializedOp}} + // expected-error@+1 {{expects users to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [tf.UnknownOp, tf.VarIsInitializedOp]}} %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> %1 = "tf.VarIsInitializedOp"(%0) : (tensor>>) -> tensor %2 = "tf.UnknownOp"(%0) : (tensor>>) -> tensor diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index fa4fe461317..9001c00bebe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -47,11 +47,14 @@ limitations under the License. // . Dead functions have already been removed, as resource arguments in dead // functions can cause the pass to fail. 
+#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project @@ -73,31 +76,117 @@ constexpr char kResourceFunctionMsg[] = "expects function level resource argument"; constexpr char kInvalidResourceMsg[] = "expects resource to be a VarHandleOp or function argument"; +constexpr char kResourceNameArgAttr[] = "tf.resource_name"; -// Records the input argument index and the current live value for a resource -// variable. -// -// . If the input argument already exists or has been added, input_index is the -// index of the function, and live_value_or_type tracks the live value of the -// resource. -// -// . If the input argument has not been added in the pass, input_index is -// kInputUnassigned, live_value_or_type represents the type of the resource. -// (a) If this resource is read, add a new argument whose type is obtained -// from live_value_or_type, and input_index and live_value_or_type will be -// updated to reference the new argument. -// (b) If this resource is written, live_value_or_type will track the new -// value of the resource. input_index will remain to be kInputUnassigned. +// Collects names of users of a resource that are not `tf.ReadVariableOp` and +// not `tf.AssignVariableOp`. +llvm::SmallSet GetCompositeResourceUserNames( + Value resource) { + // SmallSet will use a vector when there is only one element and use std::set + // when there are more than one elements. This ensures that the operations in + // the error message are ordered. + llvm::SmallSet composite_users; + for (Operation* user : resource.getUsers()) + if (!llvm::isa(user) && + !llvm::isa(user)) + composite_users.insert(user->getName().getStringRef()); + + return composite_users; +} + +// Checks if `tf.VarHandleOp` has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. +mlir::LogicalResult ValidateVarHandle(TF::VarHandleOp var_handle_op) { + auto resource_type = + getElementTypeOrSelf(var_handle_op.getType()).cast(); + if (resource_type.getSubtypes().size() != 1) + return var_handle_op.emitOpError() + << "expects resource type to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(var_handle_op); + if (!composite_ops.empty()) + return var_handle_op.emitOpError() + << "expects users to be 'tf.ReadVariableOp' or " + "'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Checks if resource argument has a valid resource subtype and its users are of +// `tf.ReadVariableOp` and `tf.AssignVariableOp` only. 
+mlir::LogicalResult ValidateResourceArgument(FuncOp function, + BlockArgument resource_arg, + TF::ResourceType resource_type) { + if (resource_type.getSubtypes().size() != 1) + return function.emitError() + << "expects resource type of argument " + << resource_arg.getArgNumber() << " to have one subtype, got " + << resource_type; + + auto composite_ops = GetCompositeResourceUserNames(resource_arg); + if (!composite_ops.empty()) + return function.emitError() + << "expects users of resource argument " + << resource_arg.getArgNumber() + << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp', got [" + << llvm::join(composite_ops.begin(), composite_ops.end(), ", ") + << "]"; + + return success(); +} + +// Adds resource arguments for every unique (name) variable handle. Associated +// `tf.VarHandleOp` are removed from the function. Variable shared names are +// returned in `var_handle_shared_names` based on the ordering of added resource +// arguments. +mlir::LogicalResult PromoteVarHandlesToArguments( + FuncOp function, bool add_validation, + llvm::SmallVectorImpl* var_handle_shared_names) { + Block& block = function.front(); + auto func_type = function.getType(); + + auto func_arg_types = llvm::to_vector<4>(func_type.getInputs()); + llvm::SmallDenseMap var_arg_index_by_name; + for (auto var_handle_op : + llvm::make_early_inc_range(block.getOps())) { + if (add_validation && failed(ValidateVarHandle(var_handle_op))) + return failure(); + + llvm::StringRef name = var_handle_op.shared_nameAttr().getValue(); + auto it = var_arg_index_by_name.insert({name, func_arg_types.size()}); + if (it.second) { + var_handle_shared_names->emplace_back(name); + auto resource_type = var_handle_op.resource().getType(); + func_arg_types.push_back(resource_type); + var_handle_op.resource().replaceAllUsesWith( + block.addArgument(resource_type)); + } else { + var_handle_op.resource().replaceAllUsesWith( + block.getArgument(it.first->getSecond())); + } + var_handle_op.erase(); + } + + if (!var_handle_shared_names->empty()) + function.setType(FunctionType::get(func_arg_types, func_type.getResults(), + function.getContext())); + + return success(); +} + +// Records the current live value for a resource variable and whether a read or +// write on the variable occurred. 
struct ResourceInfo { - static constexpr int64_t kInputUnassigned = -1; - int64_t input_index; - llvm::PointerUnion live_value_or_type; + Value live_value = nullptr; + bool read = false; + bool write = false; }; -using ArgOrName = llvm::PointerUnion; -using ResourceMap = llvm::SmallDenseMap; - -LogicalResult PromoteResourcesToArguments(FuncOp function) { +LogicalResult PromoteResourcesToArguments( + FuncOp function, llvm::ArrayRef var_handle_shared_names) { Block& block = function.front(); auto return_op = llvm::dyn_cast_or_null(block.getTerminator()); @@ -105,82 +194,61 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { return function.emitError( "expects 'main' function to have a MLIR ReturnOp"); - ResourceMap resource_map; + llvm::SmallVector resources(function.getNumArguments()); auto argument_types = llvm::to_vector<4>(function.getType().getInputs()); + bool has_resources = false; + auto add_resource_argument = [&](BlockArgument arg, + TF::ResourceType resource_type) { + Type arg_type = resource_type.getSubtypes().front(); + arg.setType(arg_type); + resources[arg.getArgNumber()].live_value = arg; + argument_types[arg.getArgNumber()] = arg_type; + has_resources = true; + }; - // Loop through the resource arguments in the function and store a mapping - // from that argument to its index and itself as the current live value. - for (BlockArgument& func_arg : function.getArguments()) { + // Loop through the non `tf.VarHandleOp` resource arguments in the function, + // validate its uses and subtype, and store a mapping from that argument to + // itself as the current live value. + auto func_args = function.getArguments().take_front( + function.getNumArguments() - var_handle_shared_names.size()); + for (BlockArgument& func_arg : func_args) { auto resource_type = getElementTypeOrSelf(func_arg.getType()).dyn_cast(); if (!resource_type) continue; - if (resource_type.getSubtypes().size() != 1) - return function.emitError() - << "expects resource type of argument " << func_arg.getArgNumber() - << " to have one subtype, got " << resource_type; + if (failed(ValidateResourceArgument(function, func_arg, resource_type))) + return failure(); - for (auto* user : func_arg.getUsers()) - if (!llvm::isa(user) && - !llvm::isa(user)) - return function.emitError() - << "expects users of resource argument " - << func_arg.getArgNumber() - << " to be 'tf.ReadVariableOp' or 'tf.AssignVariableOp'"; - - Type arg_type = resource_type.getSubtypes().front(); - func_arg.setType(arg_type); - resource_map[func_arg] = {func_arg.getArgNumber(), func_arg}; - argument_types[func_arg.getArgNumber()] = arg_type; + add_resource_argument(func_arg, resource_type); } - // Loop through the VarHandleOp in the function. When the first VarHandleOp - // for a resource variable is encountered, add an entry to the resource_map to - // record the information. Do not add a new function argument yet. - for (auto var_handle_op : block.getOps()) { - if (resource_map.count(var_handle_op.shared_nameAttr())) continue; - + // Loop through `tf.VarHandleOp` resource arguments in the function and store + // a mapping from that argument to itself as the current live value. No + // validations are necessary here as these arguments were validated prior to + // being added. 
+ auto var_handle_args = + function.getArguments().take_back(var_handle_shared_names.size()); + for (BlockArgument& var_handle_arg : var_handle_args) { auto resource_type = - getElementTypeOrSelf(var_handle_op.getType()).cast(); - if (resource_type.getSubtypes().size() != 1) - return var_handle_op.emitOpError() - << "expects resource type to have one subtype, got " - << resource_type; - - resource_map[var_handle_op.shared_nameAttr()] = { - ResourceInfo::kInputUnassigned, resource_type.getSubtypes().front()}; + getElementTypeOrSelf(var_handle_arg.getType()).cast(); + add_resource_argument(var_handle_arg, resource_type); } - if (resource_map.empty()) return success(); + if (!has_resources) return success(); // We initially assign the argument for a resource as the live value for the // resource. We then walk through the operations in the function in their // lexical order, to update the live value for the resource when we see a // store to the resource and replace reads of the resource with uses of its - // live value. For the reads, if the resource does not have a live value yet, - // we add a new argument and use it as the live value. + // live value. for (Operation& op : llvm::make_early_inc_range(block)) { if (auto read_op = llvm::dyn_cast(&op)) { if (auto func_arg = read_op.resource().dyn_cast()) { if (func_arg.getOwner() != &block) return read_op.emitOpError(kResourceFunctionMsg); - // resource_map[func_arg] is always a Value when func_arg is a - // BlockArgument. - read_op.value().replaceAllUsesWith( - resource_map[func_arg].live_value_or_type.get()); - } else if (auto var_handle_op = llvm::dyn_cast( - read_op.resource().getDefiningOp())) { - ResourceInfo& info = resource_map[var_handle_op.shared_nameAttr()]; - if (auto live_value = info.live_value_or_type.dyn_cast()) { - read_op.value().replaceAllUsesWith(live_value); - } else { - auto arg_type = info.live_value_or_type.get(); - BlockArgument arg = block.addArgument(arg_type); - info.input_index = argument_types.size(); - info.live_value_or_type = arg; - argument_types.push_back(arg_type); - read_op.value().replaceAllUsesWith(arg); - } + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.read = true; + read_op.value().replaceAllUsesWith(resource_info.live_value); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -191,11 +259,9 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { if (func_arg.getOwner() != &block) return write_op.emitOpError(kResourceFunctionMsg); - resource_map[func_arg].live_value_or_type = write_op.value(); - } else if (auto var_handle_op = llvm::dyn_cast( - write_op.resource().getDefiningOp())) { - resource_map[var_handle_op.shared_nameAttr()].live_value_or_type = - write_op.value(); + ResourceInfo& resource_info = resources[func_arg.getArgNumber()]; + resource_info.write = true; + resource_info.live_value = write_op.value(); } else { return read_op.emitOpError(kInvalidResourceMsg); } @@ -206,67 +272,68 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { const int64_t num_results_before = function.getNumResults(); auto return_operands = llvm::to_vector<4>(return_op.getOperands()); - return_operands.reserve(num_results_before + resource_map.size()); auto result_types = llvm::to_vector<4>(return_op.getOperandTypes()); - result_types.reserve(num_results_before + resource_map.size()); - llvm::SmallVector, 4> output_only_resources; - output_only_resources.reserve(resource_map.size()); + llvm::SmallVector, 4> + output_only_resources; llvm::SmallVector, 
4> input_output_alias; - input_output_alias.reserve(resource_map.size()); - // Collect new return values and either (a) output-only resource attributes - // (if the resource is not promoted to an argument) or (b) mapping from - // resource input index to output alias (if the resource has been promoted to - // an argument). If the last live value is itself (argument), then that live - // value will not be returned as the resource is unmodified. - for (auto& resource : resource_map) { - int64_t input_index = resource.getSecond().input_index; - auto live_value = resource.getSecond().live_value_or_type.dyn_cast(); - if (input_index == ResourceInfo::kInputUnassigned) { - if (!live_value) continue; - - output_only_resources.push_back( - {return_operands.size(), resource.getFirst().dyn_cast()}); - } else { - // live_value is not nullptr because any input-assigned resource has a - // Value as live_value. - auto live_arg = live_value.dyn_cast(); - if (live_arg && live_arg.getOwner() == &block && - live_arg.getArgNumber() == input_index) - continue; - - input_output_alias.push_back({input_index, return_operands.size()}); - } - return_operands.push_back(live_value); - result_types.push_back(live_value.getType()); - } - - // Erase all VarHandleOp. - for (Operation& op : llvm::make_early_inc_range(function.front())) { - auto var_handle_op = llvm::dyn_cast(op); - if (!var_handle_op) continue; - if (!var_handle_op.use_empty()) { - // SmallSet will use a vector when there is only one element and use - // std::set when there are more than one elements. This ensures that - // the operations in the error message are ordered. - llvm::SmallSet unique_operations; - llvm::for_each( - var_handle_op.getOperation()->getUsers(), [&](Operation* user) { - unique_operations.insert(user->getName().getStringRef().str()); - }); - - return var_handle_op.emitOpError( - "expects no uses but used by operations: ") - << llvm::join(unique_operations.begin(), unique_operations.end(), - ", "); - } - - op.erase(); - } - - // Rewrite return if more results need to be returned by the function. + // Collect new return values for variable writes and either (a) output-only + // resource attributes (if the resource is not promoted to an argument) or (b) + // mapping from resource input index to output alias (if the resource has been + // promoted to an argument). Resource arguments that were originally + // `tf.VarHandleOp` but not read are collected and then removed. OpBuilder builder(return_op); - if (!output_only_resources.empty() || !input_output_alias.empty()) { + const int var_handles_start_idx = + function.getNumArguments() - var_handle_shared_names.size(); + int new_argument_index = 0; + llvm::SmallVector argument_indices_to_remove; + for (auto resource_and_index : llvm::enumerate(resources)) { + const auto& resource = resource_and_index.value(); + if (!resource.live_value) { + // Ignore non resource arguments. 
+ ++new_argument_index; + continue; + } + + const auto index = resource_and_index.index(); + const bool is_var_handle = index >= var_handles_start_idx; + if (resource.write) { + if (!is_var_handle || resource.read) { + input_output_alias.push_back( + {new_argument_index, return_operands.size()}); + } else if (is_var_handle) { + output_only_resources.push_back( + {return_operands.size(), + var_handle_shared_names[index - var_handles_start_idx]}); + } + return_operands.push_back(resource.live_value); + result_types.push_back(resource.live_value.getType()); + } + + if (is_var_handle && !resource.read) { + assert(block.getArgument(index).getUses().empty()); + argument_indices_to_remove.push_back(index); + } else { + if (is_var_handle) { + // Add resource_name attribute to VarHandleOp read. + function.setArgAttr( + new_argument_index, kResourceNameArgAttr, + builder.getStringAttr( + var_handle_shared_names[index - var_handles_start_idx])); + } + ++new_argument_index; + } + } + + // Remove unread var handle arguments. + for (int argument_index_to_remove : + llvm::reverse(argument_indices_to_remove)) { + block.eraseArgument(argument_index_to_remove); + argument_types.erase(argument_types.begin() + argument_index_to_remove); + } + + // Rewrite return if there are variable writes. + if (return_operands.size() > num_results_before) { builder.create(return_op.getLoc(), return_operands); return_op.erase(); } @@ -274,17 +341,10 @@ LogicalResult PromoteResourcesToArguments(FuncOp function) { // Update function argument and result types with new resource subtypes. function.setType(builder.getFunctionType(argument_types, result_types)); - // Add resource_name attribute to the input argument for the resources. - for (auto& resource : resource_map) { - if (auto attr = resource.getFirst().dyn_cast()) { - int64_t input_index = resource.getSecond().input_index; - if (input_index != ResourceInfo::kInputUnassigned) - function.setArgAttr(input_index, "tf.resource_name", attr); - } - } // Add resource_name attribute to the output for the resources. for (auto& resource : output_only_resources) - function.setResultAttr(resource.first, "tf.resource_name", resource.second); + function.setResultAttr(resource.first, kResourceNameArgAttr, + builder.getStringAttr(resource.second)); // Add aliasing_output attribute to the input argument for the resources that // are updated by the function. @@ -315,8 +375,11 @@ void PromoteResourcesToArgsPass::runOnOperation() { return signalPassFailure(); } + llvm::SmallVector var_handle_shared_names; if (failed(ResourceLiftingForFunctionalControlFlow(main_func)) || - failed(PromoteResourcesToArguments(main_func))) + failed(PromoteVarHandlesToArguments(main_func, /*add_validation=*/true, + &var_handle_shared_names)) || + failed(PromoteResourcesToArguments(main_func, var_handle_shared_names))) return signalPassFailure(); } From c87e5c70c282a73565138099da864b258cc3b2ff Mon Sep 17 00:00:00 2001 From: Yash Katariya Date: Wed, 13 May 2020 08:50:44 -0700 Subject: [PATCH 107/412] Fix batch_to_space's formatting. 
PiperOrigin-RevId: 311336470 Change-Id: I74d238ae04c0e9938b765ca4bc8fac44e7872866 --- tensorflow/python/ops/array_ops.py | 140 ++++++++++++++++------------- 1 file changed, 80 insertions(+), 60 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 1cb6fdbd726..0ee37e186fb 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -3820,68 +3820,88 @@ def batch_to_space_v2(input, block_shape, crops, name=None): # pylint: disable= block_shape[0] - crops[0,0] - crops[0,1], ..., input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1], input_shape[M+1], ..., input_shape[N-1]] - Some Examples: - (1) For the following input of shape `[4, 1, 1, 1]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: - ```python - [[[[1]]], - [[[2]]], - [[[3]]], - [[[4]]]] - ``` - The output tensor has shape `[1, 2, 2, 1]` and value: - ``` x = [[[[1], [2]], - [[3], [4]]]] ``` - (2) For the following input of shape `[4, 1, 1, 3]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: - ```python - [[[1, 2, 3]], - [[4, 5, 6]], - [[7, 8, 9]], - [[10, 11, 12]]] - ``` - The output tensor has shape `[1, 2, 2, 3]` and value: - ```python - x = [[[[1, 2, 3], [4, 5, 6 ]], - [[7, 8, 9], [10, 11, 12]]]] - ``` - (3) For the following - input of shape `[4, 2, 2, 1]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: - ```python - x = [[[[1], [3]], [[ 9], [11]]], - [[[2], [4]], [[10], [12]]], - [[[5], [7]], [[13], [15]]], - [[[6], [8]], [[14], [16]]]] - ``` - The output tensor has shape `[1, 4, 4, 1]` and value: - ```python - x = [[[1], [2], [ 3], [ 4]], - [[5], [6], [ 7], [ 8]], - [[9], [10], [11], [12]], - [[13], [14], [15], [16]]] - ``` - (4) For the following input of shape - `[8, 1, 3, 1]`, - `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`: - ```python - x = [[[[0], [ 1], [ 3]]], - [[[0], [ 9], [11]]], - [[[0], [ 2], [ 4]]], - [[[0], [10], [12]]], - [[[0], [ 5], [ 7]]], - [[[0], [13], [15]]], - [[[0], [ 6], [ 8]]], - [[[0], [14], [16]]]] - ``` - The output tensor has shape `[2, 2, 4, 1]` and value: - ```python - x = [[[[ 1], [ 2], [ 3], [ 4]], - [[ 5], [ 6], [ 7], [ 8]]], - [[[ 9], [10], [11], [12]], - [[13], [14], [15], [16]]]] ``` name: A name for the operation (optional). 
+ Examples: + + (1) For the following input of shape `[4, 1, 1, 1]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: + + ```python + [[[[1]]], + [[[2]]], + [[[3]]], + [[[4]]]] + ``` + + The output tensor has shape `[1, 2, 2, 1]` and value: + + ``` + x = [[[[1], [2]], + [[3], [4]]]] + ``` + + (2) For the following input of shape `[4, 1, 1, 3]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: + + ```python + [[[1, 2, 3]], + [[4, 5, 6]], + [[7, 8, 9]], + [[10, 11, 12]]] + ``` + + The output tensor has shape `[1, 2, 2, 3]` and value: + + ```python + x = [[[[1, 2, 3], [4, 5, 6 ]], + [[7, 8, 9], [10, 11, 12]]]] + ``` + + (3) For the following + input of shape `[4, 2, 2, 1]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`: + + ```python + x = [[[[1], [3]], [[ 9], [11]]], + [[[2], [4]], [[10], [12]]], + [[[5], [7]], [[13], [15]]], + [[[6], [8]], [[14], [16]]]] + ``` + + The output tensor has shape `[1, 4, 4, 1]` and value: + + ```python + x = [[[1], [2], [ 3], [ 4]], + [[5], [6], [ 7], [ 8]], + [[9], [10], [11], [12]], + [[13], [14], [15], [16]]] + ``` + + (4) For the following input of shape + `[8, 1, 3, 1]`, + `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`: + + ```python + x = [[[[0], [ 1], [ 3]]], + [[[0], [ 9], [11]]], + [[[0], [ 2], [ 4]]], + [[[0], [10], [12]]], + [[[0], [ 5], [ 7]]], + [[[0], [13], [15]]], + [[[0], [ 6], [ 8]]], + [[[0], [14], [16]]]] + ``` + + The output tensor has shape `[2, 2, 4, 1]` and value: + + ```python + x = [[[[ 1], [ 2], [ 3], [ 4]], + [[ 5], [ 6], [ 7], [ 8]]], + [[[ 9], [10], [11], [12]], + [[13], [14], [15], [16]]]] + ``` + Returns: A `Tensor`. Has the same type as `input`. """ From 13f445fb39c84526a838b59a3bf48031031543f2 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 13 May 2020 08:58:13 -0700 Subject: [PATCH 108/412] For python op generation: add dispatch to all generated ops (don't skip ops with VISIBILITY=HIDDEN) PiperOrigin-RevId: 311337843 Change-Id: Ibe9a4c31e3776e1b4dce23bd4f686025dbe5a31d --- tensorflow/python/framework/python_op_gen.cc | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc index 857cc7b6638..02b659528b0 100644 --- a/tensorflow/python/framework/python_op_gen.cc +++ b/tensorflow/python/framework/python_op_gen.cc @@ -371,9 +371,7 @@ void GenEagerPythonOp::HandleGraphMode( const string& function_setup, const std::vector& output_sizes) { strings::StrAppend(&result_, " # Add nodes to the TensorFlow graph.\n"); strings::StrAppend(&result_, function_setup); - if (api_def_.visibility() == ApiDef::VISIBLE) { - strings::StrAppend(&result_, " try:\n "); - } + strings::StrAppend(&result_, " try:\n "); strings::StrAppend( &result_, " _, _, _op, _outputs = _op_def_library._apply_op_helper(\n"); AddBodyNoReturn(strings::StrCat(" \"", op_def_.name(), "\", ")); @@ -690,9 +688,7 @@ void GenEagerPythonOp::AddEagerFunctionTeardown( bool GenEagerPythonOp::AddEagerFastPathAndGraphCode( const string& parameters, const std::vector& output_sizes, const string& eager_not_allowed_error) { - if (api_def_.visibility() == ApiDef::VISIBLE) { - strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); - } + strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); AddExport(); AddDefLine(function_name_, parameters); @@ -955,8 +951,6 @@ void GenEagerPythonOp::AddEagerExecute(const string& indentation, } void GenEagerPythonOp::AddDispatch(const string& prefix) { - if (api_def_.visibility() != 
ApiDef::VISIBLE) return; - strings::StrAppend(&result_, prefix, "except (TypeError, ValueError):\n"); strings::StrAppend(&result_, prefix, " result = _dispatch.dispatch(\n"); AddBodyNoReturn(strings::StrCat(prefix, " ", function_name_, ", ")); From e8ac8116c76824c370cbbb9fc3a77b637d160106 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 09:02:09 -0700 Subject: [PATCH 109/412] Add PromoteVarHandlesToArgsPass to lift tf.VarHandleOps from function bodies to function arguments. This reuses PromoteResourcesToArgsPass in terms of converting tf.VarHandleOps to function resource arguments. PiperOrigin-RevId: 311338598 Change-Id: Ic83f1234fa51b7536ce1d3f88ee89404e9ab6689 --- .../tests/promote_resources_to_args.mlir | 4 +- .../tests/promote_var_handles_to_args.mlir | 46 ++++++++++++++++ .../mlir/tensorflow/transforms/passes.h | 4 ++ .../transforms/promote_resources_to_args.cc | 55 ++++++++++++++++--- 4 files changed, 100 insertions(+), 9 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir index eb6d40d20d9..60663f4bd4a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_resources_to_args.mlir @@ -276,7 +276,7 @@ func @main(%arg0: tensor>>, %arg1: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @no_args() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @some_args +// CHECK-SAME: (%arg0: tensor, %arg1: tensor {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @some_args(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor + return +} + +// CHECK-LABEL: func @unique_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}, %arg1: tensor>> {tf.resource_name = "y"}) +// CHECK-NOT: "tf.VarHandleOp" +func @unique_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "y"} : () -> tensor>> + return +} + +// CHECK-LABEL: func @duplicate_vars +// CHECK-SAME: (%arg0: tensor>> {tf.resource_name = "x"}) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars() { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 0b1ff2beebb..81d0259d2d6 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -91,6 +91,10 @@ std::unique_ptr> CreateResourceDeviceInferencePass(); // of their aliasing output arguments. std::unique_ptr> CreatePromoteResourcesToArgsPass(); +// Creates a pass that promotes tf.VarHandleOp to resource arguments for all +// functions. +std::unique_ptr> CreatePromoteVarHandlesToArgsPass(); + // Marks function visibility using tf.entry_function specification. That is, // functions with tf.entry_function attributes are marked with public // visibility while the other functions are marked with private visibility. 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc index 9001c00bebe..cece23b4750 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/promote_resources_to_args.cc @@ -78,6 +78,16 @@ constexpr char kInvalidResourceMsg[] = "expects resource to be a VarHandleOp or function argument"; constexpr char kResourceNameArgAttr[] = "tf.resource_name"; +// Checks if a function has only one block. +mlir::LogicalResult CheckSingleBlockFunction(FuncOp function) { + if (!hasSingleElement(function.getBlocks())) + return function.emitError() + << "expects function '" << function.getName() + << "' to have 1 block, got " << function.getBlocks().size(); + + return success(); +} + // Collects names of users of a resource that are not `tf.ReadVariableOp` and // not `tf.AssignVariableOp`. llvm::SmallSet GetCompositeResourceUserNames( @@ -191,8 +201,8 @@ LogicalResult PromoteResourcesToArguments( auto return_op = llvm::dyn_cast_or_null(block.getTerminator()); if (!return_op) - return function.emitError( - "expects 'main' function to have a MLIR ReturnOp"); + return function.emitError() << "expects function '" << function.getName() + << "' to have a MLIR ReturnOp"; llvm::SmallVector resources(function.getNumArguments()); auto argument_types = llvm::to_vector<4>(function.getType().getInputs()); @@ -369,11 +379,7 @@ void PromoteResourcesToArgsPass::runOnOperation() { // This routine should only be called when control flow operations are still // represented with TF IfOp and WhileOp operations. In this case, there should // be only one basic blocks in the MLIR representation. - if (!hasSingleElement(main_func.getBlocks())) { - main_func.emitError() << "expects 'main' function to have 1 block, got " - << main_func.getBlocks().size(); - return signalPassFailure(); - } + if (failed(CheckSingleBlockFunction(main_func))) return signalPassFailure(); llvm::SmallVector var_handle_shared_names; if (failed(ResourceLiftingForFunctionalControlFlow(main_func)) || @@ -383,15 +389,50 @@ void PromoteResourcesToArgsPass::runOnOperation() { return signalPassFailure(); } +class PromoteVarHandlesToArgsPass + : public PassWrapper> { + public: + void runOnOperation() override; +}; + +void PromoteVarHandlesToArgsPass::runOnOperation() { + ModuleOp module = getOperation(); + MLIRContext* context = module.getContext(); + for (auto function : module.getOps()) { + if (failed(CheckSingleBlockFunction(function))) return signalPassFailure(); + + llvm::SmallVector var_handle_shared_names; + PromoteVarHandlesToArguments(function, /*add_validation=*/false, + &var_handle_shared_names); + + // Add resource names for each `tf.VarHandleOp` that were promoted to + // resource arguments. 
+ const int var_handle_args_offset = + function.getNumArguments() - var_handle_shared_names.size(); + for (auto var_name_and_index : llvm::enumerate(var_handle_shared_names)) + function.setArgAttr(var_name_and_index.index() + var_handle_args_offset, + kResourceNameArgAttr, + StringAttr::get(var_name_and_index.value(), context)); + } +} + } // namespace std::unique_ptr> CreatePromoteResourcesToArgsPass() { return std::make_unique(); } +std::unique_ptr> CreatePromoteVarHandlesToArgsPass() { + return std::make_unique(); +} + static PassRegistration pass( "tf-promote-resources-to-args", "Promote resources reads/writes to function inputs/outputs."); +static PassRegistration var_handle_pass( + "tf-promote-var-handles-to-args", + "Promote tf.VarHandleOps to function arguments."); + } // namespace TF } // namespace mlir From 260cba17979ac92d2a365159b9f00dc1922aff2c Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 13 May 2020 09:07:33 -0700 Subject: [PATCH 110/412] Add support for global operation dispatchers. (This is intended for use by TF-internal classes only.) PiperOrigin-RevId: 311339670 Change-Id: Id4491f9152cec34aaa78a3d90797e0a6bbc1dea3 --- tensorflow/python/util/dispatch.py | 21 +++++++++ tensorflow/python/util/dispatch_test.py | 58 ++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py index e94e3345348..3868da14b44 100644 --- a/tensorflow/python/util/dispatch.py +++ b/tensorflow/python/util/dispatch.py @@ -39,6 +39,10 @@ from tensorflow.python.util import tf_inspect DISPATCH_ATTR = "_tf_dispatchers" +# OpDispatchers which should be used for all operations. +_GLOBAL_DISPATCHERS = [] + + class OpDispatcher(object): """Abstract base class for TensorFlow operator dispatchers. @@ -82,6 +86,19 @@ class OpDispatcher(object): getattr(op, DISPATCH_ATTR).append(self) +class GlobalOpDispatcher(object): + """Abstract base class for TensorFlow global operator dispatchers.""" + + NOT_SUPPORTED = OpDispatcher.NOT_SUPPORTED + + def handle(self, op, args, kwargs): + """Handle the specified operation with the specified arguments.""" + + def register(self): + """Register this dispatcher as a handler for all ops.""" + _GLOBAL_DISPATCHERS.append(self) + + def dispatch(op, *args, **kwargs): """Returns the result from the first successful dispatcher for a given op. @@ -101,6 +118,10 @@ def dispatch(op, *args, **kwargs): result = dispatcher.handle(args, kwargs) if result is not OpDispatcher.NOT_SUPPORTED: return result + for dispatcher in _GLOBAL_DISPATCHERS: + result = dispatcher.handle(op, args, kwargs) + if result is not OpDispatcher.NOT_SUPPORTED: + return result return OpDispatcher.NOT_SUPPORTED diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py index 89999fcf843..bd35c391924 100644 --- a/tensorflow/python/util/dispatch_test.py +++ b/tensorflow/python/util/dispatch_test.py @@ -45,6 +45,47 @@ def test_op(x, y, z): return x + (2 * y) + (3 * z) +class TensorTracer(object): + """An object used to trace TensorFlow graphs. + + This is an example class that is used to test global op dispatchers. The + global op dispatcher for TensorTracers is defined below. 
+ """ + + def __init__(self, name, args=None, kwargs=None): + self.name = name + self.args = args + self.kwargs = kwargs + + def __repr__(self): + if self.args is None and self.kwargs is None: + return self.name + else: + args = [str(x) for x in self.args] + args += sorted( + ["{}={}".format(name, x) for (name, x) in self.kwargs.items()]) + return "{}({})".format(self.name, ", ".join(args)) + + +class TensorTracerOpDispatcher(dispatch.GlobalOpDispatcher): + """Global op dispatcher for TensorTracer.""" + + def handle(self, op, args, kwargs): + # Dispatcher only applies if at least one arg is a TensorTracer. + if not (any(self.is_tensor_tracer_arg(x) for x in args) or + any(self.is_tensor_tracer_arg(x) for x in kwargs.values())): + return self.NOT_SUPPORTED + + return TensorTracer(op.__name__, args, kwargs) + + def is_tensor_tracer_arg(self, value): + if isinstance(value, TensorTracer): + return True + if isinstance(value, (list, tuple)): + if any(isinstance(x, TensorTracer) for x in value): + return True + + @test_util.run_all_in_graph_and_eager_modes class DispatchTest(test_util.TensorFlowTestCase): @@ -131,8 +172,21 @@ class DispatchTest(test_util.TensorFlowTestCase): r".*some_op \(from __main__\) is deprecated and will be " "removed in a future version.*") + def testGlobalDispatcher(self): + original_global_dispatchers = dispatch._GLOBAL_DISPATCHERS + try: + TensorTracerOpDispatcher().register() + + x = TensorTracer("x") + y = TensorTracer("y") + trace = math_ops.reduce_sum(math_ops.add(math_ops.abs(x), y), axis=3) + self.assertEqual( + str(trace), "reduce_sum(add(name=None, x=abs(x), y=y), axis=3)") + + finally: + # Clean up. + dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers + if __name__ == "__main__": googletest.main() - - From 8d25e4bf616b7ae4ed101c580a23421616bf674c Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 13 May 2020 09:08:28 -0700 Subject: [PATCH 111/412] Disable failed test for now. PiperOrigin-RevId: 311339876 Change-Id: Ie6cbff49091892e39127e511ddfe5874ebe0576d --- tensorflow/python/keras/layers/cudnn_recurrent_test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/layers/cudnn_recurrent_test.py b/tensorflow/python/keras/layers/cudnn_recurrent_test.py index 9cf132d68df..d25851f6569 100644 --- a/tensorflow/python/keras/layers/cudnn_recurrent_test.py +++ b/tensorflow/python/keras/layers/cudnn_recurrent_test.py @@ -267,6 +267,7 @@ class CuDNNV1OnlyTest(keras_parameterized.TestCase): self.assertEqual(len(layer.trainable_weights), 3) self.assertEqual(len(layer.non_trainable_weights), 0) + # TODO(b/156439419): Reenable after the bug is fixed. 
@parameterized.named_parameters( *test_util.generate_combinations_with_testcase_name( rnn_type=['LSTM', 'GRU'], to_cudnn=[True, False], model_nest_level=[1, 2], model_type=['seq', 'func'])) @test_util.run_v1_only('b/120911602, b/112083752') @test_util.run_gpu_only - def test_load_weights_between_noncudnn_rnn(self, rnn_type, to_cudnn, - bidirectional, implementation, - model_nest_level, model_type): + def DISABLED_test_load_weights_between_noncudnn_rnn( + self, rnn_type, to_cudnn, bidirectional, implementation, + model_nest_level, model_type): input_size = 10 timesteps = 6 input_shape = (timesteps, input_size) From 410c66fa83537ea07e5158df915744931f461bfa Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 16:22:40 +0000 Subject: [PATCH 112/412] Explain the reason to call _truediv_python3/_div_python2 explicitly (not through registered '/') Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 21 +++++++++++++++++++-- tensorflow/python/ops/math_ops_test.py | 3 +-- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index b981af72e83..2c141483eb1 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -439,13 +439,30 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): + # tf.math.divide will compute python style division x / y. As python 2 + # and python 3 have very much different semantics on `/` (__div__ vs. + # __truediv__), it would be natural to just use `x / y` as the operator + # '/' has already been registered for tensors, see + # _OverrideBinaryOperatorHelper for more details. + # However, in case both x and y are not tensors, the '/' registered in + # _OverrideBinaryOperatorHelper will not take effect. In this case, + # python's default '/' operator will take effect, which results in the return + # value of `tf.math.divide` being a non-Tensor. + # For that reason we explicitly call _truediv_python3/_div_python2 + # in case both x and y are not tensors. + # Since _truediv_python3/_div_python2 operate on tensors and will convert + # to tensors if needed, this avoids the following situation if not + # explicitly calling _truediv_python3/_div_python2: + # >>> tf.divide(5, 2) + # 2.5 <= should be a tf.Tensor instead.
+ if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): if sys.version_info.major < 3: - return _truediv_python2(x, y) + return _div_python2(x, y) else: return _truediv_python3(x, y) return x / y + @tf_export("math.multiply", "multiply") @dispatch.add_dispatch_support def multiply(x, y, name=None): diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index dab0ea88ba8..1debed531b6 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -498,10 +498,9 @@ class DivAndModTest(test_util.TensorFlowTestCase): def testWithPythonValue(self): # Test case for GitHub issue 39475: # https://github.com/tensorflow/tensorflow/issues/39475 - x = math_ops.divide(5, 2) + x = math_ops.divide(5, 2) self.assertTrue(isinstance(x, ops.Tensor)) - @test_util.run_all_in_graph_and_eager_modes class DivNoNanTest(test_util.TensorFlowTestCase): From 902ffede1f57b4a11c8e570e414a960392413878 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Wed, 13 May 2020 09:48:37 -0700 Subject: [PATCH 113/412] Internal change PiperOrigin-RevId: 311347501 Change-Id: I2f98f650bc5119d0aa977bed9d4c5b1305523f86 --- tensorflow/compiler/mlir/BUILD | 5 - tensorflow/compiler/mlir/tfrt/BUILD | 183 ------- .../mlir/tfrt/analysis/analysis.proto | 25 - .../tfrt/analysis/compatibility_analysis.cc | 193 ------- .../tfrt/analysis/compatibility_analysis.h | 30 -- .../dialect_static_registration.cc | 31 -- .../tfrt/runtime_fallback/lower_tf_to_tfd.cc | 390 -------------- .../runtime_fallback_combine.cc | 80 --- .../runtime_fallback/runtime_fallback_ops.cc | 45 -- .../runtime_fallback/runtime_fallback_ops.h | 41 -- .../runtime_fallback/runtime_fallback_ops.td | 158 ------ .../mlir/tfrt/saved_model/saved_model.cc | 131 ----- .../mlir/tfrt/saved_model/saved_model.h | 78 --- tensorflow/compiler/mlir/tfrt/tests/BUILD | 19 - .../compiler/mlir/tfrt/tests/analysis/BUILD | 19 - .../analysis/compatibility_analysis.mlir | 65 --- .../compiler/mlir/tfrt/tests/basics.mlir | 19 - .../mlir/tfrt/tests/err_partial_convert.mlir | 9 - tensorflow/compiler/mlir/tfrt/tests/opt.mlir | 26 - .../mlir/tfrt/tests/tf_to_corert/BUILD | 19 - .../tfrt/tests/tf_to_corert/attributes.mlir | 21 - .../mlir/tfrt/tests/tf_to_corert/basic.mlir | 34 -- .../tests/tf_to_corert/derived_attrs.mlir | 21 - .../tests/tf_to_corert/device_conversion.mlir | 12 - .../mlir/tfrt/tests/tf_to_corert/fold.mlir | 12 - .../tests/tf_to_corert/string_tensor.mlir | 10 - .../tf_executor_to_corert_pipeline.mlir | 24 - .../mlir/tfrt/tests/tf_to_tfd_lowering.mlir | 111 ---- .../compiler/mlir/tfrt/tf_legalize_to_hex.cc | 163 ------ .../compiler/mlir/tfrt/transforms/optimize.cc | 122 ----- .../compiler/mlir/tfrt/transforms/passes.h | 64 --- .../mlir/tfrt/transforms/tf_to_corert.cc | 484 ------------------ 32 files changed, 2644 deletions(-) delete mode 100644 tensorflow/compiler/mlir/tfrt/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/analysis/analysis.proto delete mode 100644 tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc delete mode 100644 
tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h delete mode 100644 tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td delete mode 100644 tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/basics.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/opt.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir delete mode 100644 tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/transforms/optimize.cc delete mode 100644 tensorflow/compiler/mlir/tfrt/transforms/passes.h delete mode 100644 tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 4a4d566f163..c0066ecda03 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -77,10 +77,6 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow:tf_legalize_hlo", "//tensorflow/compiler/mlir/tfjs:tensorflow_js_passes", - "//tensorflow/compiler/mlir/tfrt:lower_tf_to_tfd_alwayslink", - "//tensorflow/compiler/mlir/tfrt:runtime_fallback_opdefs_alwayslink", - "//tensorflow/compiler/mlir/tfrt:tf_legalize_to_tfrt", - "//tensorflow/compiler/mlir/tfrt:tf_to_corert", ], ) @@ -152,7 +148,6 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tensorflow:translate_registration", "//tensorflow/compiler/mlir/tensorflow:translate_tf_dialect_op", - "//tensorflow/compiler/mlir/tfrt:compatibility_analysis", "//tensorflow/compiler/mlir/xla:xla_mlir_translate", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD deleted file mode 100644 index edcfc574452..00000000000 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ /dev/null @@ -1,183 +0,0 @@ -load("//third_party/mlir:tblgen.bzl", "gentbl") -load("//tensorflow/core/platform:build_config.bzl", "tf_proto_library_cc") - -# TF to TFRT kernels conversion. 
-package( - default_visibility = [":friends"], - licenses = ["notice"], # Apache 2.0 -) - -package_group( - name = "friends", - packages = [ - "//learning/brain/experimental/tfrt/...", - "//tensorflow/compiler/...", - "//tensorflow/core/runtime_fallback/...", - "//tensorflow/core/tfrt/experimental/saved_model/...", - "//third_party/tf_runtime_google/...", - ], -) - -cc_library( - name = "tf_legalize_to_tfrt", - srcs = [ - "tf_legalize_to_hex.cc", - ], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "@com_google_absl//absl/memory", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", - ], - alwayslink = 1, -) - -filegroup( - name = "runtime_fallback_ops_td_files", - srcs = [ - "runtime_fallback/runtime_fallback_ops.td", - "@llvm-project//mlir:OpBaseTdFiles", - "@llvm-project//mlir:include/mlir/Interfaces/SideEffectInterfaces.td", - "@tf_runtime//:OpBaseTdFiles", - ], -) - -gentbl( - name = "runtime_fallback_ops_inc_gen", - tbl_outs = [ - ( - "-gen-op-decls", - "runtime_fallback_ops.h.inc", - ), - ( - "-gen-op-defs", - "runtime_fallback_ops.cc.inc", - ), - ], - tblgen = "@llvm-project//mlir:mlir-tblgen", - td_file = "runtime_fallback/runtime_fallback_ops.td", - td_includes = [ - "external/tf_runtime/include", - ], - td_srcs = [ - ":runtime_fallback_ops_td_files", - ], -) - -cc_library( - name = "runtime_fallback_opdefs_alwayslink", - srcs = [ - "runtime_fallback/dialect_static_registration.cc", - "runtime_fallback/runtime_fallback_combine.cc", - "runtime_fallback/runtime_fallback_ops.cc", - ], - hdrs = [ - "runtime_fallback/runtime_fallback_ops.h", - ], - deps = [ - ":runtime_fallback_ops_inc_gen", - "@llvm-project//mlir:Dialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:SideEffects", - "@llvm-project//mlir:Support", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - "@tf_runtime//:tensor_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "lower_tf_to_tfd_alwayslink", - srcs = ["runtime_fallback/lower_tf_to_tfd.cc"], - deps = [ - "runtime_fallback_opdefs_alwayslink", - "//tensorflow/compiler/mlir/tensorflow", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Transforms", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "tf_to_corert", - srcs = [ - "transforms/optimize.cc", - "transforms/tf_to_corert.cc", - ], - hdrs = [ - "transforms/passes.h", - ], - deps = [ - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_tensor", - "//tensorflow/core:framework", - "//tensorflow/core/platform:tstring", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Transforms", - "@tf_runtime//:basic_kernels_opdefs_alwayslink", - "@tf_runtime//:core_runtime_opdefs_alwayslink", - ], - alwayslink = 1, -) - -cc_library( - name = "saved_model", - srcs = [ - "saved_model/saved_model.cc", - ], - hdrs = [ - "saved_model/saved_model.h", - ], - deps = [ - ":tf_to_corert", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", - "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", - "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "//tensorflow/core/platform:status", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/strings", - "@llvm-project//mlir:IR", - 
"@llvm-project//mlir:Pass", - "@tf_runtime//:core_runtime", - "@tf_runtime//:hostcontext", - "@tf_runtime//:mlirtobef", - "@tf_runtime//:support", - "@tf_runtime//:tensor", - ], -) - -cc_library( - name = "compatibility_analysis", - srcs = [ - "analysis/compatibility_analysis.cc", - ], - hdrs = [ - "analysis/compatibility_analysis.h", - ], - deps = [ - ":analysis/analysis_proto_cc", - ":tf_to_corert", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/core:lib_proto_parsing", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Translation", - ], - alwayslink = 1, -) - -tf_proto_library_cc( - name = "analysis/analysis_proto", - srcs = ["analysis/analysis.proto"], - cc_api_version = 2, -) diff --git a/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto b/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto deleted file mode 100644 index 0716a243bb3..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/analysis.proto +++ /dev/null @@ -1,25 +0,0 @@ -syntax = "proto3"; - -package mlir.tfrt; - -message CompatibilityAnalysisReportProto { - bool unknown_dialect = 1; - bool ref_variable = 2; - bool incompatible_variable = 3; - bool incompatible_attribute = 4; - bool control_flow_v1 = 5; - - // TODO(chky): add more checks, eg. tensor datatypes. -} - -message CompatibilityAnalysisProto { - CompatibilityAnalysisReportProto summary = 1; - - message OpInfo { - int32 count = 1; - - CompatibilityAnalysisReportProto report = 2; - } - - map ops = 2; -} diff --git a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc b/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc deleted file mode 100644 index 7e9c5544c25..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.cc +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h" - -#include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/Function.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Translation.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/core/platform/protobuf.h" - -namespace tensorflow { -namespace { - -class CompatibilityAnalysis { - public: - void AnalyzeOperation(mlir::Operation* op); - - const mlir::tfrt::CompatibilityAnalysisProto& GetResult() const { - return analysis_; - } - - private: - // Return true if some attributes in the op are not supported. 
- bool AnalyzeOpAttributes(mlir::Operation* op); - // Return true if this op has unsupported operation (eg. mutate) on resource - // variables. - bool AnalyzeVariable(mlir::Operation* op); - - void UpdateReport( - const mlir::tfrt::CompatibilityAnalysisReportProto& new_report, - mlir::tfrt::CompatibilityAnalysisReportProto* old_report); - - mlir::tfrt::CompatibilityAnalysisProto analysis_; -}; - -void CompatibilityAnalysis::AnalyzeOperation(mlir::Operation* op) { - // Skip the standard ops that are allowed in tf dialect. - if (llvm::isa(op) || llvm::isa(op) || - llvm::isa(op) || llvm::isa(op)) - return; - - auto op_name = op->getName(); - - std::string name = op_name.getStringRef().str(); - - mlir::tfrt::CompatibilityAnalysisReportProto op_report; - - if (op_name.getDialect() == - mlir::TF::TensorFlowDialect::getDialectNamespace()) { - // Analyze op attributes. - if (AnalyzeOpAttributes(op)) op_report.set_incompatible_attribute(true); - - // Analyze variable operations. - if (AnalyzeVariable(op)) op_report.set_incompatible_variable(true); - - // Reference variable is not supported. - if (op_name.getStringRef() == "tf.VariableV2") - op_report.set_ref_variable(true); - } else if (op_name.getDialect() == "tf_executor") { - if (llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op) || - llvm::isa(op)) { - op_report.set_control_flow_v1(true); - } else { - // Skip the rest of the tf_executor ops as they can be handled. - // - // TODO(chky): consider adding whitelist here. - return; - } - } else { - // Mark unknown dialect in the report. - op_report.set_unknown_dialect(true); - } - - auto& op_info = (*analysis_.mutable_ops())[name]; - op_info.set_count(op_info.count() + 1); - - UpdateReport(op_report, op_info.mutable_report()); - UpdateReport(op_report, analysis_.mutable_summary()); -} - -bool CompatibilityAnalysis::AnalyzeOpAttributes(mlir::Operation* op) { - // tf.Const gets special handling so it is always compatible. - if (llvm::isa(op)) return false; - - // TODO(chky): Derived attributes should be also analyzed here. - for (auto attr : op->getAttrs()) { - if (attr.first.strref() == "_output_shapes") continue; - if (attr.first.strref() == "_class") continue; - - // Symbol attributes (eg. function names) is currently not supported. - // - // TODO(chky): CoreRT should ideally support function call operatoins. - // Remove this condition once that is implemented. - if (attr.second.isa()) return true; - - // Currently only tensors of simple dtypes (i1, i32, i64, f32, f64) are - // supported. - if (auto elements_attr = attr.second.dyn_cast()) { - if (!elements_attr.isa()) return true; - auto element_type = elements_attr.getType().getElementType(); - if (element_type.isa()) return true; - } - - // Currently only arrays of simple element types (i1, i32, i64, f32, f64) - // are supported. - if (auto array_attr = attr.second.dyn_cast()) { - if (array_attr.size() > 0) { - if (array_attr[0].isa()) return true; - - if (array_attr[0].isa()) return true; - - if (array_attr[0].isa()) return true; - } - } - } - return false; -} - -bool CompatibilityAnalysis::AnalyzeVariable(mlir::Operation* op) { - // Currently only supported variable op is ReadVariableOp. 
- if (llvm::isa(op)) return false; - - for (auto value : op->getOperands()) { - auto type = value.getType(); - if (auto tensor_type = type.dyn_cast()) { - auto element_type = tensor_type.getElementType(); - if (element_type.isa()) return true; - } - } - - return false; -} - -void CompatibilityAnalysis::UpdateReport( - const mlir::tfrt::CompatibilityAnalysisReportProto& new_report, - mlir::tfrt::CompatibilityAnalysisReportProto* old_report) { - if (new_report.unknown_dialect()) old_report->set_unknown_dialect(true); - - if (new_report.ref_variable()) old_report->set_ref_variable(true); - - if (new_report.incompatible_variable()) - old_report->set_incompatible_variable(true); - - if (new_report.incompatible_attribute()) - old_report->set_incompatible_attribute(true); - - if (new_report.control_flow_v1()) old_report->set_control_flow_v1(true); -} - -} // namespace - -mlir::tfrt::CompatibilityAnalysisProto AnalyzeTFCompatibility( - mlir::ModuleOp op) { - CompatibilityAnalysis analysis; - op.walk([&analysis](mlir::Operation* op) { analysis.AnalyzeOperation(op); }); - return analysis.GetResult(); -} - -static mlir::TranslateFromMLIRRegistration registration( - "analyze-tf-for-tfrt", [](mlir::ModuleOp op, llvm::raw_ostream& output) { - auto analysis_proto = AnalyzeTFCompatibility(op); - std::string text_proto; - if (tensorflow::protobuf::TextFormat::PrintToString(analysis_proto, - &text_proto)) { - output << text_proto; - return mlir::success(); - } - - return mlir::failure(); - }); - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h b/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h deleted file mode 100644 index 7250a9493bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/analysis/compatibility_analysis.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COMPATIBILITY_ANALYSIS_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COMPATIBILITY_ANALYSIS_H_ - -#include "mlir/IR/Module.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tfrt/analysis/analysis.pb.h" - -namespace tensorflow { - -// Analyze a MLIR module in tf dialect. -mlir::tfrt::CompatibilityAnalysisProto AnalyzeTFCompatibility( - mlir::ModuleOp op); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COMPATIBILITY_ANALYSIS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc deleted file mode 100644 index 7632b0546fa..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/dialect_static_registration.cc +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===- dialect_static_registration.cc -------------------------------------===// -// -// This file registers the RuntimeFallbackDialect. -// -//===----------------------------------------------------------------------===// - -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -namespace mlir { -namespace tfd { - -// Static initialization for dialect registration. -static DialectRegistration tfd_registration; - -} // namespace tfd -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc deleted file mode 100644 index 5f831c9ef6a..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/lower_tf_to_tfd.cc +++ /dev/null @@ -1,390 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include -#include - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Function.h" -#include "mlir/IR/MLIRContext.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/DialectConversion.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" -#include "tfrt/basic_kernels/opdefs/basic_kernels.h" - -namespace mlir { -namespace { - -constexpr const char kTmpLoweringCastOpName[] = "tmp_lowering_cast_op"; - -static Type GetChainType(MLIRContext* context) { - auto hexDialect = Identifier::get("hex", context); - return OpaqueType::get(hexDialect, "chain", context); -} - -static Type GetTfdTensorType(MLIRContext* context) { - auto tfdDialect = Identifier::get("tfd", context); - return OpaqueType::get(tfdDialect, "tf_tensor", context); -} - -struct TfToTfdLoweringPass - : public PassWrapper> { - void runOnOperation() final; -}; - -class FuncOpSignatureConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - FuncOp funcOp, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) const override { - auto ctx = funcOp.getContext(); - auto chain_type = GetChainType(ctx); - auto tfd_tensor_type = GetTfdTensorType(ctx); - FunctionType type = funcOp.getType(); - - // Convert function return results. The lowered function is expected to - // return a chain as the first return result. For each original TF tensor, - // the lowered function returns a TFD tensor instead. - llvm::SmallVector converted_results; - if (type.getNumResults() > 0) { - // Add a chain as the first return result. - converted_results.push_back(chain_type); - - // Convert the original TF tensor return results. - for (unsigned i = 0, e = type.getNumResults(); i != e; ++i) { - if (auto tensor_type = type.getResult(i).dyn_cast()) { - // Each TF tensor is converted to a TFD tensor. - converted_results.push_back(tfd_tensor_type); - } else { - // Only handle TF tensor conversion for now. - return failure(); - } - } - } - - // Create the new function signature. The lowered function is expected to - // take a Chain as the first argument. Then for each TF tensor argument, - // expect a TFD tensor argument instead. - TypeConverter::SignatureConversion new_func_sig(type.getNumInputs() + 1); - if (type.getNumInputs() > 0) { - // Add the first chain argument. - new_func_sig.addInputs(chain_type); - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - // For each original TF tensor type, convert it to one TFD tensor type. - if (auto tensor_type = type.getInput(i).dyn_cast()) { - new_func_sig.addInputs(i, {tfd_tensor_type}); - } else { - // Only handle TF tensor argument for now. - return failure(); - } - } - } - // Each function has a single region. In general, each region can have - // multiple blocks. Assume that all TF-dialect functions only have a - // single entry block. - Block* entry = &funcOp.front(); - - // Tell the rewriter to convert the region signature. After this, the - // function region takes the new function signature, which means index - // shifts by one. - Block* convertedEntry = - rewriter.applySignatureConversion(&funcOp.getBody(), new_func_sig); - - { - // Generate the "fake" mapping ops. 
The insertion guard restores rewriter - // insertion pointer when it gets out of scope. - OpBuilder::InsertionGuard guard(rewriter); - rewriter.setInsertionPointToStart(convertedEntry); - // Replace block arguments. For example, - // func @example(i64, i1) -> i64 { - // ^bb0(%a: i64, %cond: i1): // replacing this. - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - // For each original block argument, create a fake op that takes the - // input the input chain argument to the function, and the tfd tensor - // argument, and returns the original TF tensor input. Note that the - // function signature has been replaced, so entry->getArgument(0) is the - // input chain. And we need to add 1 to index to get the original - // argument. - Type orig_input = type.getInput(i); - OperationState tmp_lowering_cast_op( - funcOp.getLoc(), kTmpLoweringCastOpName, - {convertedEntry->getArgument(0), - convertedEntry->getArgument(i + 1)}, - orig_input, {}); - Value repl_value = - rewriter.createOperation(tmp_lowering_cast_op)->getResult(0); - // Replace original uses of TF tensor block argument with the result of - // the fake op. This sets up the lowering passes for individual ops - // which at this point still expect TF tensors rather than TFD tensor - // inputs. - rewriter.replaceUsesOfBlockArgument(entry->getArgument(i), repl_value); - } - } - - // Create a new function op with an updated signature. - auto new_func_op = rewriter.cloneWithoutRegions(funcOp); - rewriter.inlineRegionBefore(funcOp.getBody(), new_func_op.getBody(), - new_func_op.end()); - new_func_op.setType(FunctionType::get(new_func_sig.getConvertedTypes(), - converted_results, ctx)); - // Remove the old function op. - rewriter.eraseOp(funcOp); - return success(); - } -}; - -// Lower each TF op to a tfd.delegate_kernel op. For example, -// -// %1 = "tf.ReadVariableOp"(%arg) { -// dtype = "tfdtype$DT_FLOAT" -// } : (tensor<*x!tf.resource>) -> tensor<10xf32> -// -// would be lowered to -// -// %1:2 = "tfd.delegate_kernel"(%chain_in, %arg) { -// _name = "tf.ReadVariableOp", -// attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" -// } : (!hex.chain, !tfd.tf_tensor) -> (!hex.chain, !tfd.tf_tensor) -// -// Each tfd.delegate_kernel op expects a chain as the first input. This chain -// may come from the first function argument or the previous converted op -// output. The rest of inputs would be converted to a tfd tensor input. -// Each tfd.delegate_kernel op returns a chain as the first output. Each -// original output TensorType is converted a tfd tensor type. -// The TF op name becomes an _name attribute. Each TF attribute is lowered to -// two TFD attributes, one for the name, one for the type and value. -// -// Because delegate_kernel ops are threaded through chains, we lowered to a -// serial execution plan. -// TODO(zhangqiaorjc): Do analysis to allow concurrent execution. -template -class TFOpConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - TF_OP op, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) // NOLINT(google-runtime-references - const override { - auto ctx = op.getContext(); - // Handle new op operands. - // Delegate kernel expects the first argument to be a chain, followed by - // original arguments to the target TF op converted to TFD tensors. 
- llvm::SmallVector delegate_kernel_op_operands; - int num_new_operands = op.getOperation()->getNumOperands() + 1; - delegate_kernel_op_operands.reserve(num_new_operands); - - // Get the input chain from the previous delegate_kernel op or first block - // argument. - Value chain_input = nullptr; - auto* block = op.getOperation()->getBlock(); - assert(block->isEntryBlock() && "only supports a single block"); - // Find a previous delegate_kernel op for its output chain. - auto* prev_op = op.getOperation()->getPrevNode(); - while (prev_op != nullptr && !isa(prev_op)) { - prev_op = prev_op->getPrevNode(); - } - if (prev_op != nullptr) { - // There is another delegate kernel op before this op. - auto prev_op_result_0 = prev_op->getResult(0); - assert(prev_op_result_0.getType() == GetChainType(ctx)); - chain_input = prev_op_result_0; - } else { - // This op is the first delegate kernel op in a block. - auto arg_0 = block->getArgument(0); - assert(arg_0.getType() == GetChainType(ctx)); - chain_input = arg_0; - } - delegate_kernel_op_operands.push_back(chain_input); - - // Convert each TensorType operand to the corresponding TFD tensor operand. - for (auto operand : operands) { - auto* tmp_lowering_cast_op = operand.getDefiningOp(); - assert(tmp_lowering_cast_op->getName().getStringRef() == - kTmpLoweringCastOpName); - delegate_kernel_op_operands.push_back( - tmp_lowering_cast_op->getOperand(1)); - } - - // Handle new op results. - llvm::SmallVector delegate_kernel_op_results; - // The first output is a chain. - delegate_kernel_op_results.push_back(GetChainType(ctx)); - // For each original output, there is a corresponding TFD tensor output. - for (int i = 0, e = op.getOperation()->getNumResults(); i != e; ++i) { - delegate_kernel_op_results.push_back(GetTfdTensorType(ctx)); - } - - // Convert TF attribute to TFD attribute. - llvm::SmallVector delegate_kernel_op_attributes; - NamedAttribute op_name_attr(Identifier::get("_name", ctx), - StringAttr::get(op.getOperationName(), ctx)); - delegate_kernel_op_attributes.push_back(op_name_attr); - - int attr_idx = 0; - for (const NamedAttribute& tf_attr : op.getAttrs()) { - // Small std::string benefits from small string optimization in libc++. - NamedAttribute attr_name( - Identifier::get("attr" + std::to_string(attr_idx) + "_name", ctx), - StringAttr::get(tf_attr.first, ctx)); - NamedAttribute attr_value( - Identifier::get("attr" + std::to_string(attr_idx) + "_value", ctx), - tf_attr.second); - delegate_kernel_op_attributes.push_back(attr_name); - delegate_kernel_op_attributes.push_back(attr_value); - attr_idx++; - } - - // Replace the TF op with TFD delegate kernel op. - auto new_op = rewriter.create( - op.getLoc(), delegate_kernel_op_results, delegate_kernel_op_operands, - delegate_kernel_op_attributes); - - // Create lowering cast ops for non-chain results. - llvm::SmallVector lowering_cast_ops_values; - // Skip the first result. It's a chain which has no current users. 
- for (int i = 1, e = new_op.getOperation()->getNumResults(); i != e; ++i) { - Type orig_input = op.getType(); - OperationState tmp_lowering_cast_op(new_op.getLoc(), - kTmpLoweringCastOpName, - {new_op.getOperation()->getResult(0), - new_op.getOperation()->getResult(i)}, - {orig_input}, {}); - Value repl_value = - rewriter.createOperation(tmp_lowering_cast_op)->getResult(0); - lowering_cast_ops_values.push_back(repl_value); - } - - rewriter.replaceOp(op, lowering_cast_ops_values); - return success(); - } -}; - -class ReturnOpConversion : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - // Replace std.return with hex.return. The first result is always a chain and - // each original TF tensor result is converted to a TFD tensor. - LogicalResult matchAndRewrite( - ReturnOp return_op, llvm::ArrayRef operands, - ConversionPatternRewriter& rewriter) const override { - auto ctx = return_op.getContext(); - Value chain_output = nullptr; - llvm::SmallVector new_return_op_operands; - new_return_op_operands.reserve(return_op.getNumOperands() + 1); - // Convert each TF tensor operand to the corresponding TFD tensor operand. - for (auto operand : operands) { - auto* tmp_lowering_cast_op = operand.getDefiningOp(); - if (tmp_lowering_cast_op->getName().getStringRef() != - kTmpLoweringCastOpName) { - assert(false && "unexpected producer of operand"); - } - if (chain_output == nullptr) { - // Get the input chain from the previous op or first block argument. - auto* block = return_op.getOperation()->getBlock(); - if (!block->isEntryBlock()) { - assert(false && "only supports a single block"); - } - // Find a previous delegate_kernel op for its output chain. - auto* prev_op = return_op.getOperation()->getPrevNode(); - while (prev_op != nullptr && !isa(prev_op)) { - prev_op = prev_op->getPrevNode(); - } - if (prev_op != nullptr) { - // There is another delegate kernel op before this op. - auto prev_op_result_0 = prev_op->getResult(0); - if (prev_op_result_0.getType() != GetChainType(ctx)) { - assert(false && - "delegate kernel must produce chain as the first result"); - } - chain_output = prev_op_result_0; - } else { - // This op is the first delegate kernel op in a block. - auto arg_0 = block->getArgument(0); - if (arg_0.getType() != GetChainType(ctx)) { - assert(false && "first block argument must be a chain"); - } - chain_output = arg_0; - } - new_return_op_operands.push_back(chain_output); - } - new_return_op_operands.push_back(tmp_lowering_cast_op->getOperand(1)); - } - // Replace the old std.return op with the new hex.return op. - rewriter.create(return_op.getLoc(), - new_return_op_operands); - rewriter.eraseOp(return_op); - - return success(); - } -}; - -void TfToTfdLoweringPass::runOnOperation() { - ConversionTarget target(getContext()); - - // Make tmp_lowering_cast_op legal for conversion. But delete them after the - // passes. - OperationName tmp_lowering_cast_op_name(kTmpLoweringCastOpName, - &getContext()); - target.setOpAction(tmp_lowering_cast_op_name, - ConversionTarget::LegalizationAction::Legal); - - // target.addLegalDialect(); - target.addLegalDialect(); - - target.addDynamicallyLegalOp([](FuncOp function) { - // Returns true if this function is legal, i.e. all inputs and outputs are - // TFRT types. 
- FunctionType type = function.getType(); - for (unsigned i = 0, e = type.getNumInputs(); i != e; ++i) { - if (type.getInput(i).isa()) return false; - } - for (unsigned i = 0, e = type.getNumResults(); i != e; ++i) { - if (type.getResult(i).isa()) return false; - } - return true; - }); - - target.addLegalOp(); - - OwningRewritePatternList patterns; - patterns.insert, - TFOpConversion, TFOpConversion, - TFOpConversion, TFOpConversion, - ReturnOpConversion>(&getContext()); - - if (failed(applyPartialConversion(getOperation(), target, patterns))) - signalPassFailure(); - - // Delete the tmp_lowering_cast_op's since they are illegal. - getOperation().walk([&tmp_lowering_cast_op_name](Operation* op) { - if (op->getName() == tmp_lowering_cast_op_name) op->erase(); - }); -} - -} // namespace -} // namespace mlir - -static mlir::PassRegistration pass( - "tf-to-tfd-lowering", "Lowers the TF dialect to Runtime Fallback dialect."); diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc deleted file mode 100644 index 4fd57af55cc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_combine.cc +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -//===----------------------------------------------------------------------===// -// -// This file implements a set of simple combiners for optimizing operations in -// the Runtime Fallback dialect. -// -//===----------------------------------------------------------------------===// - -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -// This optimizes the following scenario: -// %tft0, %c2 = "tfd.move_dht_to_tft"(%dht0, %c1) -// : (!dht.host_tensor, !hex.chain) -> (!tfd.tf_tensor, !hex.chain) -// %dht1, %c3 = "tfd.convert_tft_to_dht"(%tft0, %c2) -// : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) -// some_op %dht1, %c3 -// -// becomes -// some_op %dht0, %c1 - -struct SimplifyDoubleConversion - : public mlir::OpRewritePattern { - // We register this pattern to match every tfd.move_dht_to_tft op. - // The "benefit" is used by the framework to order the patterns and process - // them in order of profitability. - explicit SimplifyDoubleConversion(mlir::MLIRContext* context) - : mlir::OpRewritePattern(context, - /*benefit=*/1) {} - - // This method attempts to match a pattern and rewrite it. The rewriter - // argument is the orchestrator of the sequence of rewrites. The pattern is - // expected to interact with it to perform any changes to the IR from here. - mlir::LogicalResult matchAndRewrite( - mlir::tfd::ConvertTftToDhtOp op, - mlir::PatternRewriter& rewriter) const override { - // Look through the inputs of the ConvertTftToDhtOp. 
- mlir::Value convert_op_input_0 = op.getOperand(0); - mlir::Value convert_op_input_1 = op.getOperand(1); - mlir::tfd::MoveDhtToTftOp move_input_op_0 = - llvm::dyn_cast_or_null( - convert_op_input_0.getDefiningOp()); - mlir::tfd::MoveDhtToTftOp move_input_op_1 = - llvm::dyn_cast_or_null( - convert_op_input_1.getDefiningOp()); - - // The inputs should be MoveDhtToTftOp. - if (!move_input_op_0 || !move_input_op_1) return mlir::failure(); - // Both inputs are the same MoveDhtToTftOp. - if (move_input_op_0 != move_input_op_1) return mlir::failure(); - - // Use the rewriter to replace the ConvertTftToDhtOp's users with the - // operands of MoveDhtToTftOp. - rewriter.replaceOp( - op, {move_input_op_0.getOperand(0), move_input_op_0.getOperand(1)}); - return mlir::success(); - } -}; - -// Register rewrite pattern as "canonicalization" patterns on the MoveDhtToTftOp -// so that they can be picked up by the Canonicalization framework. -void mlir::tfd::ConvertTftToDhtOp::getCanonicalizationPatterns( - OwningRewritePatternList& results, MLIRContext* context) { - results.insert(context); -} diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc deleted file mode 100644 index 9c69154673b..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h" - -namespace mlir { -namespace tfd { - -//===----------------------------------------------------------------------===// -// TfrtDelegate Dialect -//===----------------------------------------------------------------------===// - -RuntimeFallbackDialect::RuntimeFallbackDialect(MLIRContext *context) - : Dialect(/*name=*/"tfd", context) { - allowUnknownTypes(); - - allowUnknownOperations(); - - addOperations< -#define GET_OP_LIST -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.cc.inc" - >(); -} - -//===----------------------------------------------------------------------===// -// TableGen'd op method definitions -//===----------------------------------------------------------------------===// - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.cc.inc" - -} // namespace tfd -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h deleted file mode 100644 index 009d565e40d..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.h +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file defines the operations used in the Runtime Fallback dialect. - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ - -#include "mlir/IR/Dialect.h" // from @llvm-project -#include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project - -namespace mlir { -namespace tfd { - -// Dialect for TFRT delegate operations. -class RuntimeFallbackDialect : public Dialect { - public: - explicit RuntimeFallbackDialect(MLIRContext* context); - static StringRef getDialectNamespace() { return "tfd"; } -}; - -#define GET_OP_CLASSES -#include "tensorflow/compiler/mlir/tfrt/runtime_fallback_ops.h.inc" - -} // namespace tfd -} // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_RUNTIME_FALLBACK_RUNTIME_FALLBACK_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td b/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td deleted file mode 100644 index c33c6f8d73d..00000000000 --- a/tensorflow/compiler/mlir/tfrt/runtime_fallback/runtime_fallback_ops.td +++ /dev/null @@ -1,158 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This is the definition file for the Runtime Fallback Dialect. - -#ifdef TFRT_DELEGATE_DIALECT -#else -#define TFRT_DELEGATE_DIALECT - -include "tfrt/tfrt_op_base.td" -include "mlir/Interfaces/SideEffectInterfaces.td" - -//===----------------------------------------------------------------------===// -// Type definitions -//===----------------------------------------------------------------------===// -def TfTensorType : OpaqueType<"tfd", "tf_tensor", "!tfd.tf_tensor type">; - -//===----------------------------------------------------------------------===// -// Runtime Fallback Dialect definitions -//===----------------------------------------------------------------------===// - -def RuntimeFallback_Dialect : Dialect { - let name = "tfd"; - - let description = [{ - The Runtime Fallback dialect. - - This dialect contains operations to run existing TF kernels on TFRT by - invoking TF Eager API. 
- }]; - - let cppNamespace = "tfd"; -} - -//===----------------------------------------------------------------------===// -// Runtime Fallback Dialect Ops definitions -//===----------------------------------------------------------------------===// - -// Base class for the operation in this dialect. -class RuntimeFallbackDialect_Op traits = []> : - Op { } - -def InitEagerContextOp : RuntimeFallbackDialect_Op<"init_eager_context"> { - let summary = "eager context initialization operation"; - let description = [{ - The "tfd.init_eager_context" operation takes an input chain, creates and - initializes the TF EagerContext and returns an output chain. - - Example: - %c1 = "tfd.init_eager_context"(%c0): (!hex.chain) -> !hex.chain - }]; - - let arguments = (ins ChainType); - let results = (outs ChainType); -} - -def DelegateKernelOp : RuntimeFallbackDialect_Op<"delegate_kernel"> { - let summary = "delegate kernel operation"; - let description = [{ - The "tfd.delegate_kernel" operation takes an input chain, and arbitrary - number of input arguments, and runs a specified TF op via TFE C API. It - returns an output chain and variable number of outputs from the TF op. - - The input arguments and attributes are passed to the TF op. The ouputs are - outputs of the TF op. - - Note that `_name` is a required attribute specifying the TF op to run. - TFRT attributes are sorted alphabetically, passed in as positional - attributes to the TFRT kernel, rather than as named attributes. - - Example: - To run "tf.MatMul" op, which has two boolean attributes, - 1. Set _name = "MatMul" - 2. For each TF attribute, split it into two attributes, one for name of - the TF attribute, and the other for the type and value of the - attribute value. Attribute value is a string with the format of - "type$val", where type can be "bool", "string", "tfdtype", "tfshape", - "tftensor". - The value serialization format can be found in attr_util.h. - - %out_c, %out_tensor = "tfd.delegate_kernel"( - %in_c, %in1_tensor, %in2_tensor) { - _name = "MatMul", - attr1_name = "transpose_a", attr1_value = "bool$false", - attr2_name = "transpose_b", attr2_value = "bool$false" - } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) -> ( - !hex.chain, !tfd.tf_tensor) - }]; - - let arguments = (ins ChainType, Variadic); - let results = (outs ChainType, Variadic); -} - -def PrintTftOp : RuntimeFallbackDialect_Op<"print_tft"> { - let summary = "print TF tensor operation"; - let description = [{ - The "tfd.print_tft" operation prints the input TF tensor. It takes an input - TF tensor to be printed and an input chain, and returns an output chain. - - Example: - %c1 = "tfd.print_tft"(%t, %c) : (!tfd.tf_tensor, !hex.chain) -> !hex.chain - - }]; - - let arguments = (ins TfTensorType, ChainType); - let results = (outs ChainType); -} - -def ConvertTftToDhtOp : RuntimeFallbackDialect_Op<"convert_tft_to_dht", [NoSideEffect]> { - let summary = "convert TF tensor to TFRT DHT tensor operation"; - let description = [{ - The "tfd.convert_tft_to_dht" operation converts a TF tensor to a TFRT - DenseHostTensor. - - It takes as input a TF Tensor and an input chain, and returns a converted - TFRT DHT tensor and an output chain. - - Example: - %dht, %c0 = "tfd.convert_tft_to_dht"(%tft, %c) - : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) - }]; - - let arguments = (ins TfTensorType, ChainType); - // Enable registering canonicalization patterns with this operation. 
- let hasCanonicalizer = 1; - let results = (outs TensorType, ChainType); -} - -def MoveDhtToTftOp : RuntimeFallbackDialect_Op<"move_dht_to_tft", [NoSideEffect]> { - let summary = "convert TFRT DHT tensor to DHT tensor operation"; - let description = [{ - The "tfd.move_dht_to_tft" operation moves a TFRT tensor into a TF Tensor. - - It takes as input a TFRT Tensor and an input chain, and returns a TF tensor - with the same underlying buffer and an output chain. - - Example: - %dht, %c0 = "tfd.convert_tft_to_dht"(%tft, %c) - : (!tfd.tf_tensor, !hex.chain) -> (!dht.host_tensor, !hex.chain) - }]; - - let arguments = (ins TensorType, ChainType); - let results = (outs TfTensorType, ChainType); -} - -#endif // TFRT_DELEGATE_DIALECT diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc deleted file mode 100644 index 92571148cff..00000000000 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h" - -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Pass/PassManager.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tfrt/bef_converter/mlir_to_bef.h" -#include "tfrt/core_runtime/core_runtime.h" -#include "tfrt/core_runtime/op_handler.h" -#include "tfrt/host_context/host_context.h" -#include "tfrt/tensor/dense_host_tensor_view.h" - -namespace tensorflow { -namespace { - -llvm::StringRef ProcessIndexPath(mlir::ArrayAttr index_path) { - if (index_path.size() == 1 && index_path[0].isa()) { - // TODO(chky): Support cases where index_path is not a single string. - return index_path[0].cast().getValue(); - } - return ""; -} - -} // namespace - -void MapFunctionSignaturesFromTFSavedModelMLIR( - mlir::ModuleOp module, - llvm::function_ref> - input_names_and_devices, - llvm::ArrayRef output_names, - llvm::ArrayRef global_tensors)> - map_fn) { - // Create global_tensors for each functions. - mlir::SymbolTable symbol_table(module); - module.walk([&symbol_table, map_fn](mlir::FuncOp func) { - // Use the exported name as the function name, and skip non-exported - // functions. - auto func_names = mlir::tf_saved_model::GetExportedNames(func); - if (func_names.empty()) return; - - // Here we walk through each arguments and find out the input/output names, - // and input devices, variables used by this function. 
- llvm::SmallVector, 4> - input_names_and_devices; - llvm::SmallVector global_tensors; - for (unsigned i = 0, e = func.getNumArguments(); i != e; ++i) { - if (auto input_index_path = func.getArgAttrOfType( - i, "tf_saved_model.index_path")) { - std::pair name_and_device; - name_and_device.first = ProcessIndexPath(input_index_path); - if (auto input_device = - func.getArgAttrOfType(i, "tf.device")) { - name_and_device.second = input_device.getValue(); - } - input_names_and_devices.push_back(name_and_device); - } - if (auto variable = - mlir::tf_saved_model::LookupBoundInput(func, i, symbol_table)) { - global_tensors.push_back(variable); - } - } - - llvm::SmallVector output_names; - for (unsigned i = 0, e = func.getNumResults(); i != e; ++i) { - if (auto output_index_path = func.getResultAttrOfType( - i, "tf_saved_model.index_path")) { - output_names.push_back(ProcessIndexPath(output_index_path)); - } - } - - for (auto func_name : func_names) - map_fn(func_name, input_names_and_devices, output_names, global_tensors); - }); -} - -Status CompileTFSavedModelMLIRToBEF(const TFRTSavedModelCompileOptions& options, - mlir::ModuleOp module, - tfrt::AlignedBuffer<8>* bef_buffer) { - VLOG(1) << "TF Dialect: " << tensorflow::MlirModuleToString(module); - - // Lower MLIR TF Dialect to MLIR TFRT CoreRT dialect. - mlir::PassManager pm(module.getContext()); - - tensorflow::CoreRTPipelineOptions pass_options; - if (!options.default_device.empty()) { - pass_options.default_device = options.default_device; - } - if (!options.force_data_format.empty()) { - pass_options.force_data_format = options.force_data_format; - } - pass_options.enable_optimizer = options.enable_optimizer; - tensorflow::CreateTFExecutorToCoreRTPipeline(pm, pass_options); - - if (mlir::failed(pm.run(module))) - return tensorflow::errors::Internal( - "failed to lower TF Dialect to CoreRT dialect."); - - VLOG(1) << "TFRT Dialect: " << tensorflow::MlirModuleToString(module); - - auto bef = - tfrt::ConvertMLIRToBEF(module, /* disable_optional_sections = */ true); - if (bef.empty()) - return tensorflow::errors::Internal("failed to convert MLIR to BEF."); - - assert(bef_buffer); - bef_buffer->assign(bef.begin(), bef.end()); - - return Status::OK(); -} - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h deleted file mode 100644 index 06a6c5a22f9..00000000000 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ - -#include -#include -#include - -#include "absl/container/flat_hash_map.h" -#include "absl/strings/string_view.h" -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "tensorflow/core/platform/status.h" -#include "tfrt/core_runtime/tensor_handle.h" -#include "tfrt/support/aligned_buffer.h" - -namespace tfrt { -class CoreRuntime; -} - -namespace mlir { -class ModuleOp; -} - -namespace tensorflow { - -struct TFRTSavedModelCompileOptions { - // TODO(tf-runtime-team): Ideally, compiler should make the decision where - // to place the variable. - std::string variable_device = "cpu"; - std::string default_device = "cpu"; - - // Enable compiler optimization in TFRT dialect. - bool enable_optimizer = true; - - // Force data format for all layout sensitive operations, eg. setting it to - // "NHWC" will changes all data format in the graph to "NHWC" by inserting - // or removing related tf.Transpose op. Currently the supported formats are - // "NHWC" and "NCHW". - // - // TODO(tf-runtime-team): Ideally compiler should figure out whether the - // data format should be changed, instead of controlled by users. - std::string force_data_format; -}; - -// Map signatures (eg. input/output names, variables) for each function. -void MapFunctionSignaturesFromTFSavedModelMLIR( - mlir::ModuleOp module, - llvm::function_ref> - input_names_and_devices, - llvm::ArrayRef output_names, - llvm::ArrayRef global_tensors)> - map_fn); - -// Compile MLIR in TF saved model dialect into BEF. -Status CompileTFSavedModelMLIRToBEF(const TFRTSavedModelCompileOptions& options, - mlir::ModuleOp module, - tfrt::AlignedBuffer<8>* bef_buffer); - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_SAVED_MODEL_SAVED_MODEL_H_ diff --git a/tensorflow/compiler/mlir/tfrt/tests/BUILD b/tensorflow/compiler/mlir/tfrt/tests/BUILD deleted file mode 100644 index 4faa8d2efe8..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. -filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-opt", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD b/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD deleted file mode 100644 index fc7c142ea73..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. 
-filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-mlir-translate", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir b/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir deleted file mode 100644 index 5943997a1bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/compatibility_analysis.mlir +++ /dev/null @@ -1,65 +0,0 @@ -// RUN: tf-mlir-translate -analyze-tf-for-tfrt %s | FileCheck %s - -func @main(%serialized: tensor<32x!tf.string>, - %names : tensor<32x!tf.string>, - %dense_keys : tensor<2x!tf.string>, - %dense_default_0 : tensor, - %dense_default_1 : tensor) { - // CHECK: summary { - // CHECK-NEXT: ref_variable: true - // CHECK-NEXT: incompatible_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.AssignVariableOp" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: incompatible_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.Const" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 2 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.ParseExampleV2" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.VarHandleOp" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: ops { - // CHECK-NEXT: key: "tf.VariableV2" - // CHECK-NEXT: value { - // CHECK-NEXT: count: 1 - // CHECK-NEXT: report { - // CHECK-NEXT: ref_variable: true - // CHECK-NEXT: } - // CHECK-NEXT: } - // CHECK-NEXT: } - %0 = "tf.VariableV2"() {shape = #tf.shape<2>, container = "", shared_name = ""} : () -> tensor - %1 = "tf.Const"() {value = dense<4.200000e+01> : tensor} : () -> tensor - %2 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - "tf.AssignVariableOp"(%2, %1) : (tensor<*x!tf.resource>, tensor) -> () - %empty_str_vector = "tf.Const"() - {dtype = !tf.string, value = opaque<"tf", "0x746674656E736F722464747970653A2044545F535452494E472074656E736F725F7368617065207B2064696D207B207D207D"> : tensor<0x!tf.string>} - : () -> tensor<0x!tf.string> - %result:2 = "tf.ParseExampleV2"(%serialized, %names, %empty_str_vector, %dense_keys, %empty_str_vector, %dense_default_0, %dense_default_1) - {dense_shapes = [#tf.shape<>, #tf.shape<>], num_sparse = 0 : i64, result_segment_sizes = dense<[0, 0, 0, 2, 0, 0]> : vector<6xi32>} - : (tensor<32x!tf.string>, tensor<32x!tf.string>, tensor<0x!tf.string>, tensor<2x!tf.string>, tensor<0x!tf.string>, tensor, tensor) -> (tensor<32xf32>, tensor<32xf32>) - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/basics.mlir b/tensorflow/compiler/mlir/tfrt/tests/basics.mlir deleted file mode 100644 index 650bd04b882..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/basics.mlir +++ /dev/null @@ -1,19 +0,0 @@ -// RUN: tf-opt -tf-legalize-to-hex %s -o -| FileCheck %s - - -// CHECK-LABEL: func @constants() { -func @constants() { - // CHECK: "hex.constant_int"() {value = 1 : i32} - %0 = "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "x", value = dense<1> : tensor} : () -> tensor - // CHECK: "hex.constant_int"() {value = 42 : i32} - %1 = 
"tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "y", value = dense<42> : tensor<1x1xi32>} : () -> tensor<1x1xi32> - // CHECK: hex.return - return -} - -// CHECK-LABEL: func @add -func @add(%arg0: tensor<1xi32>) { - // CHECK: hex.add_int - %2 = "tf.Add"(%arg0, %arg0) {T = "tfdtype$DT_INT32", device = "", name = "z"} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir b/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir deleted file mode 100644 index 410ff299883..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/err_partial_convert.mlir +++ /dev/null @@ -1,9 +0,0 @@ -// RUN: tf-opt %s -tf-legalize-to-hex -verify-diagnostics - -func @partial_convert() { - %0 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // expected-error @+1 {{failed to legalize operation 'tf.Const'}} - %1 = "tf.Const"() {value = dense<42> : tensor<2xi32>} : () -> tensor<2xi32> - %2 = "tf.Add"(%0, %1) : (tensor, tensor<2xi32>) -> tensor<2xi32> - return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/opt.mlir b/tensorflow/compiler/mlir/tfrt/tests/opt.mlir deleted file mode 100644 index 6f27fa6d7e4..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/opt.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: tf-opt %s -pass-pipeline='func(canonicalize)' | FileCheck %s - -// CHECK-LABEL: func @simplify_double_conversion_test( -func @simplify_double_conversion_test() { - // CHECK: %[[CREATE:.*]] = dht.create - // CHECK: %[[FILL:.*]] = dht.fill - // CHECK: dht.print_tensor %[[CREATE]], %[[FILL]] - %c0 = hex.new.chain - - // Create 2x2 dht with value 1 - %dht0 = dht.create_uninitialized_tensor.i32.2 [2 : i32, 2 : i32] - %c1 = dht.fill_tensor_with_constant.i32 %dht0, %c0 1 : i32 - - // Convert dht to tf tensor - %tft0, %c2 = "tfd.move_dht_to_tft"(%dht0, %c1) - : (!t.tensor, !hex.chain) -> (!tfd.tf_tensor, !hex.chain) - - // Convert tf tensor back to dht - %dht1, %c3 = "tfd.convert_tft_to_dht"(%tft0, %c2) - : (!tfd.tf_tensor, !hex.chain) -> (!t.tensor, !hex.chain) - - // Print the result dht - %c4 = dht.print_tensor %dht1, %c3 - - hex.return -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD deleted file mode 100644 index 4faa8d2efe8..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", - test_file_exts = ["mlir"], -) - -# Bundle together all of the test utilities that are used by tests. 
-filegroup( - name = "test_utilities", - testonly = True, - data = [ - "//tensorflow/compiler/mlir:tf-opt", - "@llvm-project//llvm:FileCheck", - ], -) diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir deleted file mode 100644 index 6c129c4be22..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/attributes.mlir +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -module attributes {tf_saved_model.semantics} { - -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "y", type = tensor<1x3xf32>, value = dense<[[1.67482901, -0.529208779, -0.803792417]]> : tensor<1x3xf32>} : () -> () - -// CHECK-LABEL: func @basic -func @func_basic( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor>> {tf_saved_model.bound_input = @y}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) - attributes {tf_saved_model.exported_names = ["basic"]} { - %1 = "tf.ReadVariableOp"(%arg1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> tensor<1x3xf32> - - // CHECK: {{%.*}} = corert.executeop({{%.*}}) "tf.MatMul" - // CHECK-SAME: {T = f32, transpose_a = false, transpose_b = false} - %2 = "tf.MatMul"(%arg0, %1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> -} - -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir deleted file mode 100644 index 40b0332b61c..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/basic.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-NOT: tf_saved_model.semantics -module attributes {tf_saved_model.semantics} { - -// CHECK-NOT: "tf_saved_model.global_tensor" -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "y", type = tensor<1x3xf32>, value = dense<[[1.67482901, -0.529208779, -0.803792417]]> : tensor<1x3xf32>} : () -> () -"tf_saved_model.global_tensor"() {is_mutable, sym_name = "z", type = tensor<3xf32>, value = dense<[1.67482901, -0.529208779, -0.803792417]> : tensor<3xf32>} : () -> () - -// CHECK-LABEL: func @basic -// CHECK-SAME: ([[arg0:%.*]]: !corert.tensorhandle, [[arg1:%.*]]: !corert.tensorhandle, -// CHECK-SAME: [[arg2:%.*]]: !corert.tensorhandle) -> !corert.tensorhandle { -func @func_basic( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor>> {tf_saved_model.bound_input = @y}, - %arg2: tensor>> {tf_saved_model.bound_input = @z}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) - attributes {tf_saved_model.exported_names = ["basic"]} { - // CHECK-NEXT: [[cpu_device:%.*]] = corert.get_device "cpu" - // CHECK-NEXT: [[r0:%.*]] = corert.executeop([[cpu_device]]) "tf.MatMul"([[arg0]], [[arg1]]) - // CHECK-NEXT: [[r1:%.*]] = corert.executeop([[cpu_device]]) "tf.BiasAdd"([[r0]], [[arg2]]) - // CHECK-NEXT: [[r2:%.*]] = corert.executeop([[cpu_device]]) "tf.Tanh"([[r1]]) - // CHECK-NEXT: hex.return [[r2]] : !corert.tensorhandle - - %0 = "tf.ReadVariableOp"(%arg2) {_output_shapes = ["tfshape$dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> tensor<3xf32> - %1 = "tf.ReadVariableOp"(%arg1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "cpu", dtype = f32} : (tensor>>) -> 
tensor<1x3xf32> - %2 = "tf.MatMul"(%arg0, %1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - %3 = "tf.BiasAdd"(%2, %0) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], data_format = "NHWC", device = "cpu"} : (tensor<3x3xf32>, tensor<3xf32>) -> tensor<3x3xf32> - %4 = "tf.Tanh"(%3) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu"} : (tensor<3x3xf32>) -> tensor<3x3xf32> - %5 = "tf.Identity"(%4) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "cpu"} : (tensor<3x3xf32>) -> tensor<3x3xf32> - return %5 : tensor<3x3xf32> -} - -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir deleted file mode 100644 index 774ea0526bd..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/derived_attrs.mlir +++ /dev/null @@ -1,21 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @derived_attrs -func @derived_attrs( - %serialized: tensor, - %names: tensor<0x!tf.string>, - %sparse_keys: tensor<0x!tf.string>, - %dense_keys: tensor<1x!tf.string>, - %ragged_keys: tensor<0x!tf.string>, - %dense_default: tensor<0xi64>) -> tensor { - - %dense_value = - "tf.ParseExampleV2"(%serialized, %names, %sparse_keys, %dense_keys, %ragged_keys, %dense_default) - // CHECK: Tdense = [i64] - // CHECK-SAME: dense_shapes = [#corert.shape<>] - { device = "cpu", num_sparse = 0 : i64, dense_shapes = [#tf.shape<>], result_segment_sizes = dense<[0, 0, 0, 1, 0, 0]> : vector<6xi32>} - : (tensor, tensor<0x!tf.string>, tensor<0x!tf.string>, tensor<1x!tf.string>, tensor<0x!tf.string>, tensor<0xi64>) - -> tensor - - return %dense_value : tensor -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir deleted file mode 100644 index 7077523b1e2..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/device_conversion.mlir +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @device_test -func @device_test( - %arg0: tensor<3x1xf32> {tf_saved_model.index_path = [0]}, - %arg1: tensor<1x3xf32> {tf_saved_model.index_path = [0]}) - -> (tensor<3x3xf32> {tf_saved_model.index_path = []}) { - // CHECK: {{%.*}} = corert.get_device "gpu" - - %2 = "tf.MatMul"(%arg0, %arg1) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "gpu", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - return %2 : tensor<3x3xf32> -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir deleted file mode 100644 index 950cef928a9..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/fold.mlir +++ /dev/null @@ -1,12 +0,0 @@ -// RUN: tf-opt -corert-optimize %s | FileCheck %s - -// CHECK-LABEL: func @fold_test -func @fold_test(%arg: !corert.tensorhandle) -> !corert.tensorhandle { - %cpu = corert.get_device "cpu" - // CHECK-NOT: tf.Const - %0 = corert.executeop(%cpu) "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 - // CHECK: "_tf.Transpose"({{%.*}}) - // CHECK-SAME: perm = dense<[0, 3, 1, 2]> : tensor<4xi32> - %1 = corert.executeop(%cpu) "tf.Transpose"(%arg, %0) {T = 
f32, Tperm = i32} : 1 - hex.return %1 : !corert.tensorhandle -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir deleted file mode 100644 index b1306be825c..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/string_tensor.mlir +++ /dev/null @@ -1,10 +0,0 @@ -// RUN: tf-opt -tf-to-corert %s | FileCheck %s - -// CHECK-LABEL: func @string_tensor -func @string_tensor() -> (tensor<0x!tf.string>, tensor<7x!tf.string>) { - // CHECK: {shape = [0], value = []} - %0 = "tf.Const"() {value = dense<[]> : tensor<0x!tf.string>} : () -> tensor<0x!tf.string> - // CHECK: {shape = [7], value = ["has_login_page_feature", "num_terms_inside_postform", "num_terms_outside_postform", "num_terms_outside_postform_without_bp", "query_params_contains_url", "title_with_login_phase", "url_contains_login_terms"]} - %1 = "tf.Const"() {value = dense<["has_login_page_feature", "num_terms_inside_postform", "num_terms_outside_postform", "num_terms_outside_postform_without_bp", "query_params_contains_url", "title_with_login_phase", "url_contains_login_terms"]> : tensor<7x!tf.string>} : () -> tensor<7x!tf.string> - return %0, %1 : tensor<0x!tf.string>, tensor<7x!tf.string> -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir deleted file mode 100644 index 5c44f558280..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/tf_executor_to_corert_pipeline.mlir +++ /dev/null @@ -1,24 +0,0 @@ -// RUN: tf-opt -tf-executor-to-corert-pipeline %s | FileCheck %s - -// CHECK-LABEL: func @basic -// CHECK-SAME: ([[arg0:%.*]]: !corert.tensorhandle, [[arg1:%.*]]: !corert.tensorhandle) -// CHECK-NEXT: [[cpu:%.*]] = corert.get_device "cpu" -// CHECK-NEXT: [[res:%.*]] = corert.executeop([[cpu]]) "tf.MatMul"([[arg0]], [[arg1]]) -// CHECK-NEXT: hex.return [[res]] : !corert.tensorhandle -module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 293 : i32}} { - func @basic(%arg0: tensor<3x1xf32>, - %arg1: tensor>> - ) -> tensor<3x3xf32> { - %0 = tf_executor.graph { - %outputs, %control = tf_executor.island wraps "tf.Const"() {value = dense<0.899999976> : tensor} : () -> tensor - %outputs_0, %control_0 = tf_executor.island { - %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor>>) -> tensor<*x!tf.resource> - %2 = "tf.ReadVariableOp"(%1) {_output_shapes = ["tfshape$dim { size: 1 } dim { size: 3 }"], device = "", dtype = f32} : (tensor<*x!tf.resource>) -> tensor<1x3xf32> - %3 = "tf.MatMul"(%arg0, %2) {T = f32, _output_shapes = ["tfshape$dim { size: 3 } dim { size: 3 }"], device = "", transpose_a = false, transpose_b = false} : (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> - tf_executor.yield %3 : tensor<3x3xf32> - } - tf_executor.fetch %outputs_0, %control_0 : tensor<3x3xf32>, !tf_executor.control - } - return %0 : tensor<3x3xf32> - } -} diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir deleted file mode 100644 index 5968a590f91..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_tfd_lowering.mlir +++ /dev/null @@ -1,111 +0,0 @@ -// RUN: tf-opt %s -tf-to-tfd-lowering | FileCheck %s - -// CHECK: func @inference_call( -// CHECK-SAME: %arg0: !hex.chain, -// CHECK-SAME: %arg1: !tfd.tf_tensor, -// CHECK-SAME: %arg2: !tfd.tf_tensor, -// CHECK-SAME: %arg3: 
!tfd.tf_tensor, -// CHECK-SAME: %arg4: !tfd.tf_tensor, -// CHECK-SAME: %arg5: !tfd.tf_tensor -// CHECK-SAME: ) -> (!hex.chain, !tfd.tf_tensor) -func @inference_call( - %arg0: tensor, - %arg1: tensor<*x!tf.resource>, - %arg2: tensor<*x!tf.resource>, - %arg3: tensor<*x!tf.resource>, - %arg4: tensor<*x!tf.resource> - )-> tensor { - // CHECK: %0:2 = "tfd.delegate_kernel"(%arg0, %arg5) - // CHECK-SAME: _name = "tf.ReadVariableOp" - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: (!hex.chain, !tfd.tf_tensor) -> (!hex.chain, !tfd.tf_tensor) - %0 = "tf.ReadVariableOp"(%arg4) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<10xf32> - - // CHECK: %1:2 = "tfd.delegate_kernel"(%0#0, %arg3) { - // CHECK-SAME: _name = "tf.ReadVariableOp" - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %1 = "tf.ReadVariableOp"(%arg2) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<512xf32> - - // CHECK: %2:2 = "tfd.delegate_kernel"(%1#0, %arg4) { - // CHECK-SAME: _name = "tf.ReadVariableOp", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %2 = "tf.ReadVariableOp"(%arg3) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<512x10xf32> - - // CHECK: %3:2 = "tfd.delegate_kernel"(%2#0, %arg2) { - // CHECK-SAME: _name = "tf.ReadVariableOp", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %3 = "tf.ReadVariableOp"(%arg1) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor<*x!tf.resource>) -> tensor<784x512xf32> - - // CHECK: %4:2 = "tfd.delegate_kernel"(%3#0, %arg1, %3#1) { - // CHECK-SAME: _name = "tf.MatMul", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT", - // CHECK-SAME: attr1_name = "transpose_a", attr1_value = false, - // CHECK-SAME: attr2_name = "transpose_b", attr2_value = false - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %4 = "tf.MatMul"(%arg0, %3) { - dtype = "tfdtype$DT_FLOAT", transpose_a = false, transpose_b = false - } : (tensor, tensor<784x512xf32>) -> tensor - - // CHECK: %5:2 = "tfd.delegate_kernel"(%4#0, %4#1, %1#1) { - // CHECK-SAME: _name = "tf.AddV2" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %5 = "tf.AddV2"(%4, %1) - : (tensor, tensor<512xf32>)-> tensor - - // CHECK: %6:2 = "tfd.delegate_kernel"(%5#0, %5#1) { - // CHECK-SAME: _name = "tf.Relu", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %6 = "tf.Relu"(%5) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor) -> tensor - - // CHECK: %7:2 = "tfd.delegate_kernel"(%6#0, %6#1, %2#1) { - // CHECK-SAME: _name = "tf.MatMul", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT", - // CHECK-SAME: attr1_name = "transpose_a", attr1_value = false, - // CHECK-SAME: attr2_name = "transpose_b", attr2_value = false - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %7 = "tf.MatMul"(%6, %2) { - dtype = "tfdtype$DT_FLOAT", transpose_a = 
false, transpose_b = false - } : (tensor, tensor<512x10xf32>) -> tensor - - // CHECK: %8:2 = "tfd.delegate_kernel"(%7#0, %7#1, %0#1) { - // CHECK-SAME: _name = "tf.AddV2", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %8 = "tf.AddV2"(%7, %0) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor, tensor<10xf32>) -> tensor - - // CHECK: %9:2 = "tfd.delegate_kernel"(%8#0, %8#1) { - // CHECK-SAME: _name = "tf.Identity", - // CHECK-SAME: attr0_name = "dtype", attr0_value = "tfdtype$DT_FLOAT" - // CHECK-SAME: } : (!hex.chain, !tfd.tf_tensor) - // CHECK-SAME: -> (!hex.chain, !tfd.tf_tensor) - %9 = "tf.Identity"(%8) { - dtype = "tfdtype$DT_FLOAT" - } : (tensor) -> tensor - - // CHECK: hex.return %9#0, %9#1 : !hex.chain, !tfd.tf_tensor - return %9 : tensor -} diff --git a/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc b/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc deleted file mode 100644 index 9d13955490b..00000000000 --- a/tensorflow/compiler/mlir/tfrt/tf_legalize_to_hex.cc +++ /dev/null @@ -1,163 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements lowering of Tf dialect to TFRT Hex kernels. -// -// Current lowering is a placeholder performing trivial conversion -// for integer constants and additions. - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Module.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/IR/StandardTypes.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/LogicalResult.h" -#include "mlir/Transforms/DialectConversion.h" -#include "absl/memory/memory.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" - -namespace mlir { -namespace { - -// Pattern rewrite rules for "tf.Const", "tf.Add" and "return" ops. -bool isInt32LikeType(Type t) { - if (t.isSignlessInteger(32)) return true; - if (auto ttype = t.dyn_cast()) { - if (ttype.hasStaticShape() && ttype.getNumElements() == 1 && - ttype.getElementType().isSignlessInteger(32)) - return true; - } - return false; -} - -// Replaces 32-bit integer TF::ConstOp with "hex.constant_int" op. -struct ConstOpConversion : public ConversionPattern { - explicit ConstOpConversion(MLIRContext *context) - : ConversionPattern(TF::ConstOp::getOperationName(), 1, context) {} - - LogicalResult matchAndRewrite( - Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto constOp = cast(op); - if (!isInt32LikeType(constOp.getType())) return failure(); - - auto valueAttr = constOp.value(); - auto newAttr = Attribute(); - - // Convert constant op if it has an integer or dense elements attribute. - // Other kinds of element attributes are not converted for now. 
- if (valueAttr.isa()) { - newAttr = valueAttr; - } else if (auto v = valueAttr.dyn_cast()) { - if (v.isSplat()) newAttr = v.getSplatValue(); - } - if (!newAttr) return failure(); - - mlir::OperationState state(constOp.getLoc(), "hex.constant_int"); - state.types.push_back(rewriter.getIntegerType(32)); - state.addAttribute("value", newAttr); - auto newOp = rewriter.createOperation(state); - rewriter.replaceOp(op, newOp->getResult(0)); - return success(); - } -}; - -// Replaces 32-bit integer TF::Add op with "hex.add_int" op. -struct AddOpConversion : public ConversionPattern { - explicit AddOpConversion(MLIRContext *context) - : ConversionPattern(TF::AddOp::getOperationName(), 1, context) {} - - LogicalResult matchAndRewrite( - Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - auto addOp = cast(op); - - if (!isInt32LikeType(operands[0].getType()) || - !isInt32LikeType(operands[1].getType())) - return failure(); - - auto int32Ty = rewriter.getIntegerType(32); - mlir::OperationState state(addOp.getLoc(), "hex.add_int", operands, - {int32Ty}, {}); - auto newOp = rewriter.createOperation(state); - rewriter.replaceOp(op, newOp->getResult(0)); - return success(); - } -}; - -// Replaces return op that has no arguments with "hex.return" op. -struct ReturnOpConversion : public OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - ReturnOp srcOp, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - if (srcOp.getNumOperands() != 0) return failure(); - - mlir::OperationState state(srcOp.getLoc(), "hex.return"); - rewriter.createOperation(state); - - rewriter.eraseOp(srcOp); - return success(); - } -}; - -// Legalize TF operations to host program dialect. -struct TfLegalizeToHex - : public PassWrapper> { - void runOnOperation() override { - auto *ctx = &getContext(); - TypeConverter converter; - converter.addConversion([](Type type) -> Type { - // Convert single element tensor type of int32s to int32 type - if (isInt32LikeType(type)) { - return IntegerType::get(32, type.getContext()); - } - return Type(); - }); - - OwningRewritePatternList patterns; - - // For now, replace only int32 TF::OpConst, TF::OpAdd and OpReturn with - // "hex.constant_int", "hex.add_int" and "hex.return", respectively. - patterns.insert( - ctx); - - ConversionTarget target(*ctx); - const auto legal = ConversionTarget::LegalizationAction::Legal; - target.setOpAction(OperationName(StringRef("hex.constant_int"), ctx), - legal); - target.setOpAction(OperationName(StringRef("hex.add_int"), ctx), legal); - target.setOpAction(OperationName(StringRef("hex.return"), ctx), legal); - target.addLegalOp(); - - auto result = - applyFullConversion(getOperation(), target, patterns, &converter); - if (failed(result)) signalPassFailure(); - } -}; - -} // namespace - -std::unique_ptr> createLegalizeToHexPass() { - return std::make_unique(); -} - -static PassRegistration pass( - "tf-legalize-to-hex", - "Convert TF dialect to the TF runtime host program dialect."); -} // namespace mlir diff --git a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc b/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc deleted file mode 100644 index 9e06ba1f4bc..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/optimize.cc +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements the optimzation passe on TFRT CoreRuntime dialect. -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/IR/Matchers.h" -#include "mlir/IR/PatternMatch.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tfrt/core_runtime/opdefs/core_runtime.h" - -namespace tensorflow { -namespace { - -// Implement a constant fold pattern for corert dialect. The following pattern -// will be matched: -// -// %0 = corert.executeop(%cpu) "tf.Const"() -// {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 -// %1 = corert.executeop(%cpu) "tf.Transpose"(%arg, %0) -// {T = f32, Tperm = i32} : 1 -// -// And it will converted to: -// -// %1 = corert.executeop(%cpu) "_tf.Transpose"(%arg) -// {T = f32, Tperm = i32, perm = dense<[0, 3, 1, 2]> : tensor<4xi32>} : 1 -// -class CoreRTExecuteOpRewritePattern - : public mlir::OpRewritePattern { - public: - CoreRTExecuteOpRewritePattern( - mlir::MLIRContext *context, - ArrayRef>> ops_to_attrs) - : OpRewritePattern(context), - ops_to_attrs_(ops_to_attrs.begin(), ops_to_attrs.end()) {} - - mlir::LogicalResult matchAndRewrite( - tfrt::corert::ExecuteOp op, - mlir::PatternRewriter &rewriter) const override { - auto attr_names = ops_to_attrs_.lookup(op.op_name()); - if (attr_names.empty()) return failure(); - - SmallVector new_operands; - SmallVector, 4> new_attributes; - op.getOpAttrs(&new_attributes); - assert(op.operands().size() == attr_names.size()); - for (const auto &iter : llvm::zip(op.operands(), attr_names)) { - mlir::Value arg = std::get<0>(iter); - StringRef name = std::get<1>(iter); - - Attribute const_attr; - if (!name.empty() && matchPattern(arg, m_Constant(&const_attr))) { - // Convert the folded argument to an attribute. - new_attributes.push_back({name, const_attr}); - } else { - // Keep the argument that is not folded. - new_operands.push_back(arg); - } - } - - if (new_operands.size() == op.operands().size()) return failure(); - - SmallString<32> new_op_name{"_"}; - new_op_name += op.op_name(); - - rewriter.replaceOpWithNewOp( - op, op.getResultTypes(), op.device(), new_operands, new_attributes, - new_op_name); - - return success(); - } - - private: - // Map from op_name to attr_names. The attr_names indicates the name of the - // attribute to which each constant-folded argument is converted. An empty - // string means this argument should not be folded. 
- llvm::DenseMap> ops_to_attrs_; -}; - -struct CoreRTOptimizePass - : public mlir::PassWrapper { - void runOnFunction() override { - mlir::OwningRewritePatternList patterns; - auto func = getFunction(); - - static constexpr StringRef kMeanAttrs[] = {"", "reduction_indices"}; - static constexpr StringRef kPadAttrs[] = {"", "paddings"}; - static constexpr StringRef kTransposeAttrs[] = {"", "perm"}; - - static constexpr std::pair> kOpsToAttrs[] = { - {"tf.Mean", kMeanAttrs}, - {"tf.Pad", kPadAttrs}, - {"tf.Transpose", kTransposeAttrs}, - }; - - patterns.insert(&getContext(), kOpsToAttrs); - - mlir::applyPatternsAndFoldGreedily(func, patterns); - } -}; - -} // namespace - -std::unique_ptr> CreateCoreRTOptimizePass() { - return std::make_unique(); -} - -static mlir::PassRegistration pass("corert-optimize", - "Optimizes corert."); - -} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.h b/tensorflow/compiler/mlir/tfrt/transforms/passes.h deleted file mode 100644 index be0bf0fbd1f..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ -#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ - -#include - -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project - -namespace tensorflow { - -// Create a pass that converts MLIR TF dialect to MLIR TFRT CoreRT dialect. -std::unique_ptr CreateTFToCoreRTConversionPass(); - -// Run TFToCoreRTConversionPass as a free function. Useful for reusing the pass -// logic in a custom pass with additional conversions. -mlir::LogicalResult TFToCoreRTConversionPassRun( - mlir::MLIRContext* context, mlir::ModuleOp* module, - mlir::ConversionTarget* target, mlir::OwningRewritePatternList* patterns); - -// Create the corert optimization pass. -std::unique_ptr> CreateCoreRTOptimizePass(); - -struct CoreRTPipelineOptions - : public mlir::PassPipelineOptions { - Option default_device{ - *this, "default-device", llvm::cl::desc("default device assignment"), - llvm::cl::init("cpu")}; - Option enable_optimizer{ - *this, "enable-optimizer", - llvm::cl::desc("run optimization passes on corert dialect"), - llvm::cl::init(false)}; - Option force_data_format{ - *this, "force-data-format", - llvm::cl::desc("force data format for all layout sensitive operations")}; -}; - -// Creates a pipeline of passes that lowers MLIR TF Executor dialect to TF -// dialect for CoreRT purposes. -void CreateTFExecutorToTFPipeline( - mlir::OpPassManager& pm, const CoreRTPipelineOptions& options); // NOLINT - -// Creates a pipeline of passes that converts MLIR TF Executor dialect to CoreRT -// dialect. 
-void CreateTFExecutorToCoreRTPipeline( - mlir::OpPassManager& pm, const CoreRTPipelineOptions& options); // NOLINT - -} // namespace tensorflow - -#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc b/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc deleted file mode 100644 index 0784dc4ffea..00000000000 --- a/tensorflow/compiler/mlir/tfrt/transforms/tf_to_corert.cc +++ /dev/null @@ -1,484 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This file implements lowering of TF dialect to TFRT CoreRuntime ExecuteOp. -// This lowering pass is heavily experimental and incomplete. External code -// should not depend on the code here. And please do not take example on it as -// "the path forward" for this. - -#include - -#include "mlir/Dialect/StandardOps/IR/Ops.h" -#include "mlir/Pass/PassManager.h" -#include "mlir/Transforms/DialectConversion.h" -#include "mlir/Transforms/Passes.h" -#include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/OperationSupport.h" // from @llvm-project -#include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Pass/PassOptions.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/platform/tstring.h" -#include "tfrt/basic_kernels/opdefs/basic_kernels.h" -#include "tfrt/core_runtime/opdefs/attributes.h" -#include "tfrt/core_runtime/opdefs/core_runtime.h" - -namespace tensorflow { -namespace { - -// TODO(chky): define these dialect types instead of using opaque types. -mlir::Type CreateDeviceType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("corert"), "device", - builder->getContext()); -} - -mlir::Type CreateTensorHandleType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("corert"), "tensorhandle", - builder->getContext()); -} - -mlir::Type CreateStringType(mlir::Builder *builder) { - return mlir::OpaqueType::get(builder->getIdentifier("hex"), "string", - builder->getContext()); -} - -// A helper class for converting CoreRT types and attributes. 
-class CoreRTConverter : public mlir::TypeConverter { - public: - explicit CoreRTConverter(mlir::MLIRContext *context) - : builder_(context), - device_type_(CreateDeviceType(&builder_)), - tensor_handle_type_(CreateTensorHandleType(&builder_)) { - addConversion([](Type type) { return type; }); - addConversion([=](TensorType type) { return tensor_handle_type_; }); - } - - // Create a single attribute that contains the named attribute lists. It is an - // array of pairs. The key must be a string attribute, and the value can be - // any attribute that is supported by CoreRuntime. - mlir::ArrayAttr CreateOpAttrs(ArrayRef attrs) { - llvm::SmallVector attr_array; - for (auto key_and_value : attrs) { - if (!IsUnusedAttribute(key_and_value.first)) { - auto converted = ConvertAttribute(key_and_value.second); - if (!converted) return {}; - - mlir::StringAttr key = builder_.getStringAttr(key_and_value.first); - attr_array.push_back(builder_.getArrayAttr({key, converted})); - } - } - return builder_.getArrayAttr(attr_array); - } - - // Convert the device attribute in `op` to a device value produced by the - // corresponding GetDeviceOp in the current block. If there does not exist - // one, insert a GetDeviceOp to the beginning of the block and return the - // device value. - Value ConvertDevice(mlir::Operation *op, - ConversionPatternRewriter *rewriter) const { - auto device_attr = op->getAttr("device"); - if (!device_attr) { - op->emitOpError("device attribute not found."); - return {}; - } - - auto device_name = device_attr.cast().getValue(); - if (device_name.empty()) { - op->emitOpError("device has not been assigned."); - return {}; - } - - op->removeAttr(rewriter->getIdentifier("device")); - - auto *block = op->getBlock(); - - if (auto get_device_op = GetDeviceOrNull(device_name, block)) - return get_device_op.device(); - - ConversionPatternRewriter::InsertionGuard insertion_guard(*rewriter); - rewriter->setInsertionPointToStart(block); - return rewriter - ->create(block->getParent()->getLoc(), - device_type(), device_name) - .device(); - } - - mlir::Type device_type() const { return device_type_; } - mlir::Type tensor_handle_type() const { return tensor_handle_type_; } - - private: - // TODO(chky): attributes "_output_shapes" should be removed by any tool that - // generates TF MLIR dialect, as they are not used by CoreRuntime. Remove this - // filtering logic once unused attributes are cleaned up in the upper layer. - bool IsUnusedAttribute(llvm::StringRef name) const { - return name == "_output_shapes"; - } - - // Returns the converted attribute in TFRT dialect. If the conversion fails, - // returns a null attribute instead. - mlir::Attribute ConvertAttribute(mlir::Attribute attr) { - // The supported attributes here should be kept consistent with - // //third_party/tf_runtime/include/tfrt/core_runtime/op_attr_type.h - // - // Currently, not all tensorflow data types are supported. Unranked shape - // attributes are not supported yet. - - // Return directly if the attribute is already supported. - if (attr.isa() || attr.isa() || - attr.isa() || attr.isa() || - attr.isa() || - attr.isa()) - return attr; - - // Convert the attribute to the corresponding format in TFRT dialect if - // needed. - if (auto shape_attr = attr.dyn_cast()) { - if (!shape_attr.hasRank()) return {}; - return tfrt::corert::ShapeAttr::get(builder_.getContext(), - shape_attr.getShape()); - } - - // For arrays, we recursively convert the elements. 
- if (auto array_attr = attr.dyn_cast()) { - llvm::SmallVector attrs; - attrs.reserve(array_attr.size()); - for (auto attr : array_attr) { - auto converted = ConvertAttribute(attr); - if (!converted) return {}; - attrs.push_back(converted); - } - return builder_.getArrayAttr(attrs); - } - - return {}; - } - - // Find a GetDeviceOp that matches the device_name at the beginning of the - // block. Return nullptr if it does not find one. - tfrt::corert::GetDeviceOp GetDeviceOrNull(StringRef device_name, - Block *block) const { - for (auto &op : *block) { - auto get_device_op = llvm::dyn_cast(&op); - if (!get_device_op) break; - if (get_device_op.device_name() == device_name) return get_device_op; - } - return nullptr; - } - - mlir::Builder builder_; - mlir::Type device_type_; - mlir::Type tensor_handle_type_; -}; - -// Lower a tf.Const op that creates a string tensor to a native -// corert.create_string_tensor op. -class CoreRTConstStringTensorOpConversion - : public mlir::OpConversionPattern { - public: - CoreRTConstStringTensorOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) - : mlir::OpConversionPattern(context), - corert_converter_(*corert_converter) {} - - LogicalResult matchAndRewrite( - mlir::TF::ConstOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - if (!op.dtype().isa()) return failure(); - - DenseStringElementsAttr attr = op.value().cast(); - - llvm::SmallVector values; - values.reserve(attr.getNumElements()); - for (const auto &element : attr.getRawStringData()) - values.push_back(rewriter.getStringAttr( - llvm::StringRef(element.data(), element.size()))); - - // Create the shape attribute from the tensor shape. - ArrayRef shape = op.value().getType().getShape(); - llvm::SmallVector dims; - dims.reserve(shape.size()); - auto i64_type = rewriter.getIntegerType(64); - for (auto dim : shape) - dims.push_back(rewriter.getIntegerAttr(i64_type, dim)); - - auto new_op = rewriter.create( - op.getLoc(), corert_converter_.tensor_handle_type(), - rewriter.getArrayAttr(dims), rewriter.getArrayAttr(values)); - - rewriter.replaceOp(op, new_op.result()); - - return success(); - } - - private: - CoreRTConverter &corert_converter_; -}; - -// Convert TF dialect operations with no side effects to CoreRT ExecuteOp. For -// example, -// -// %0 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : -// (tensor<3x1xf32>, tensor<1x3xf32>) -> tensor<3x3xf32> -// -// is converted to -// -// %result = corert.executeop(%device) -// "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = false} : -// (!corert.tensorhandle, !corert.tensorhandle) -> !corert.tensorhandle -// -// Note that it will fail to match if some attributes are not supported. -template -class CoreRTExecuteOpConversion : public mlir::OpConversionPattern { - public: - CoreRTExecuteOpConversion(mlir::MLIRContext *context, - CoreRTConverter *corert_converter) - : mlir::OpConversionPattern(context), - corert_converter_(*corert_converter) {} - - LogicalResult matchAndRewrite( - TF_Op op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - mlir::StringAttr op_name = rewriter.getStringAttr(op.getOperationName()); - - llvm::SmallVector result_types; - for (auto type : op.getOperation()->getResultTypes()) - result_types.push_back(corert_converter_.convertType(type)); - - // Get the device, or create one if there does not exist one. 
- auto device = corert_converter_.ConvertDevice(op, &rewriter); - if (!device) return failure(); - - auto derived_attrs = op.materializeDerivedAttributes(); - for (auto named_attr : derived_attrs) { - op.setAttr(named_attr.first, named_attr.second); - } - - ArrayAttr op_attrs = corert_converter_.CreateOpAttrs(op.getAttrs()); - if (!op_attrs) return failure(); - - auto new_op = rewriter.create( - op.getLoc(), result_types, device, operands, op_attrs, op_name); - - rewriter.replaceOp(op, new_op.results()); - return success(); - } - - private: - CoreRTConverter &corert_converter_; -}; - -// Deletes the op and forwards the arguments. -template -class PassThroughConversion : public mlir::OpConversionPattern { - public: - explicit PassThroughConversion(MLIRContext *context) - : mlir::OpConversionPattern(context) {} - - LogicalResult matchAndRewrite( - TF_Op op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { // NOLINT - // Just forward the arguments to results. - rewriter.replaceOp(op, operands); - return success(); - } -}; - -// Convert standard ReturnOp to hex.return. -// -// TODO(chky): conversion to hex kernels should come from a common tf_to_hex -// library. -class ReturnOpConversion : public mlir::OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - mlir::ReturnOp op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(op, operands); - return success(); - } -}; - -// Convert TF dialect to CoreRT dialect. -class TFToCoreRTConversionPass - : public mlir::PassWrapper> { - void runOnOperation() override { - auto module = getOperation(); - mlir::ConversionTarget target(getContext()); - mlir::OwningRewritePatternList patterns; - if (failed(TFToCoreRTConversionPassRun(&getContext(), &module, &target, - &patterns))) - signalPassFailure(); - } -}; - -} // namespace - -LogicalResult TFToCoreRTConversionPassRun( - mlir::MLIRContext *context, mlir::ModuleOp *module, - mlir::ConversionTarget *target, mlir::OwningRewritePatternList *patterns) { - module->removeAttr("tf_saved_model.semantics"); - - mlir::Builder builder(context); - auto bound_id = builder.getIdentifier("tf_saved_model.bound_input"); - auto path_id = builder.getIdentifier("tf_saved_model.index_path"); - - module->walk([bound_id, path_id, module](mlir::Operation *op) mutable { - if (auto func_op = dyn_cast(op)) { - // Remove tf_saved_model specific function arg attributes. - for (unsigned i = 0, e = func_op.getNumArguments(); i != e; ++i) { - func_op.removeArgAttr(i, bound_id); - func_op.removeArgAttr(i, path_id); - } - for (unsigned i = 0, e = func_op.getNumResults(); i != e; ++i) { - func_op.removeResultAttr(i, bound_id); - func_op.removeResultAttr(i, path_id); - } - if (auto exported_names = func_op.getAttrOfType( - "tf_saved_model.exported_names")) { - // Create a function for each exported name. - // - // TODO(b/148477882): TFRT dialect should have similar concepts of - // exported names so that a function can be referenced by multiple - // exported names. - func_op.removeAttr("tf_saved_model.exported_names"); - for (auto exported_name : exported_names) { - auto exported_func_op = func_op.clone(); - exported_func_op.setName( - exported_name.cast().getValue()); - module->insert(module->begin(), exported_func_op); - } - func_op.erase(); - } - } else if (isa(op)) { - // Remove all global_tensor_ops. 
- op->erase(); - } - }); - - CoreRTConverter corert_converter(context); - - target->addLegalDialect(); - target->addLegalDialect(); - target->addIllegalDialect(); - target->addDynamicallyLegalOp([&corert_converter](FuncOp op) { - return corert_converter.isSignatureLegal(op.getType()); - }); - - patterns->insert, - PassThroughConversion, ReturnOpConversion>( - context); - - // Here we use one specialized pattern for tf.Const with string tensors as - // it will incorrect to use ExecuteOp pattern to convert string tensor - // attribute. - patterns->insert(context, - &corert_converter); - - // TODO(b/148823030): Pattern registration for TF operations is not - // sustainable currently. We need to figure out a plan - patterns->insert, - // TODO(chky): Move the ReadVariableOp + Identity pattern - // to optimizer. - // CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion, - CoreRTExecuteOpConversion>(context, - &corert_converter); - - mlir::populateFuncOpTypeConversionPattern(*patterns, context, - corert_converter); - return mlir::applyPartialConversion(*module, *target, *patterns); -} - -std::unique_ptr CreateTFToCoreRTConversionPass() { - return std::make_unique(); -} - -void CreateTFExecutorToTFPipeline(mlir::OpPassManager &pm, - const CoreRTPipelineOptions &options) { - // First, we prune unused operations in MLIR in TF Executor dialect. - pm.addPass(mlir::tf_executor::CreateTFExecutorGraphPruningPass()); - - // Then we pass the MLIR module through the TF standard pipeline, which for - // instances does shape inference, canonicalization, inlining, etc. - mlir::TF::StandardPipelineOptions tf_options; - tf_options.enable_inliner = true; - mlir::TF::CreateTFStandardPipeline(pm, tf_options); - - // After all standard passes run layout optimization to assign optimal data - // format for all layout sensitive operations. - mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options; - layout_optimization_options.force_data_format = - options.force_data_format.getValue(); - mlir::TF::CreateLayoutOptimizationPipeline(pm, layout_optimization_options); - - // Run canonicalization pipeline to remove unused constants and bypassed - // transpose operations left in the IR after layout optimization. - pm.addNestedPass(mlir::createCanonicalizerPass()); - - if (options.default_device == "gpu") - pm.addNestedPass(mlir::TF::CreateGpuOpFusionPass()); - - // Then we assign default devices. - pm.addNestedPass( - mlir::TF::CreateSimpleTFDeviceAssignmentPass(options.default_device)); -} - -void CreateTFExecutorToCoreRTPipeline(mlir::OpPassManager &pm, - const CoreRTPipelineOptions &options) { - CreateTFExecutorToTFPipeline(pm, options); - - // Convert it to MLIR in CoreRT dialect. - pm.addPass(CreateTFToCoreRTConversionPass()); - - // Run optimizer on the MLIR module in CoreRT dialect. 
- if (options.enable_optimizer) - pm.addNestedPass(CreateCoreRTOptimizePass()); -} - -static mlir::PassRegistration pass( - "tf-to-corert", - "Convert Tensorflow dialect to TFRT's CoreRuntime dialect."); - -static mlir::PassPipelineRegistration pipeline( - "tf-executor-to-corert-pipeline", - "Convert Tensorflow Executor dialect to TFRT's CoreRuntime dialect, and " - "also apply necessary optimization passes.", - CreateTFExecutorToCoreRTPipeline); - -} // namespace tensorflow From 21b04b6fe0c5bc6a8dd0cc2f414760f47b142ae9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 10:02:03 -0700 Subject: [PATCH 114/412] Add support for global operation dispatchers. (This is intended for use by TF-internal classes only.) PiperOrigin-RevId: 311350209 Change-Id: Ib095f019fc6825409b490d7dec7e86116955b746 --- tensorflow/python/util/dispatch.py | 21 --------- tensorflow/python/util/dispatch_test.py | 58 +------------------------ 2 files changed, 2 insertions(+), 77 deletions(-) diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py index 3868da14b44..e94e3345348 100644 --- a/tensorflow/python/util/dispatch.py +++ b/tensorflow/python/util/dispatch.py @@ -39,10 +39,6 @@ from tensorflow.python.util import tf_inspect DISPATCH_ATTR = "_tf_dispatchers" -# OpDispatchers which should be used for all operations. -_GLOBAL_DISPATCHERS = [] - - class OpDispatcher(object): """Abstract base class for TensorFlow operator dispatchers. @@ -86,19 +82,6 @@ class OpDispatcher(object): getattr(op, DISPATCH_ATTR).append(self) -class GlobalOpDispatcher(object): - """Abstract base class for TensorFlow global operator dispatchers.""" - - NOT_SUPPORTED = OpDispatcher.NOT_SUPPORTED - - def handle(self, op, args, kwargs): - """Handle the specified operation with the specified arguments.""" - - def register(self): - """Register this dispatcher as a handler for all ops.""" - _GLOBAL_DISPATCHERS.append(self) - - def dispatch(op, *args, **kwargs): """Returns the result from the first successful dispatcher for a given op. @@ -118,10 +101,6 @@ def dispatch(op, *args, **kwargs): result = dispatcher.handle(args, kwargs) if result is not OpDispatcher.NOT_SUPPORTED: return result - for dispatcher in _GLOBAL_DISPATCHERS: - result = dispatcher.handle(op, args, kwargs) - if result is not OpDispatcher.NOT_SUPPORTED: - return result return OpDispatcher.NOT_SUPPORTED diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py index bd35c391924..89999fcf843 100644 --- a/tensorflow/python/util/dispatch_test.py +++ b/tensorflow/python/util/dispatch_test.py @@ -45,47 +45,6 @@ def test_op(x, y, z): return x + (2 * y) + (3 * z) -class TensorTracer(object): - """An object used to trace TensorFlow graphs. - - This is an example class that is used to test global op dispatchers. The - global op dispatcher for TensorTracers is defined below. - """ - - def __init__(self, name, args=None, kwargs=None): - self.name = name - self.args = args - self.kwargs = kwargs - - def __repr__(self): - if self.args is None and self.kwargs is None: - return self.name - else: - args = [str(x) for x in self.args] - args += sorted( - ["{}={}".format(name, x) for (name, x) in self.kwargs.items()]) - return "{}({})".format(self.name, ", ".join(args)) - - -class TensorTracerOpDispatcher(dispatch.GlobalOpDispatcher): - """Global op dispatcher for TensorTracer.""" - - def handle(self, op, args, kwargs): - # Dispatcher only applies if at least one arg is a TensorTracer. 
- if not (any(self.is_tensor_tracer_arg(x) for x in args) or - any(self.is_tensor_tracer_arg(x) for x in kwargs.values())): - return self.NOT_SUPPORTED - - return TensorTracer(op.__name__, args, kwargs) - - def is_tensor_tracer_arg(self, value): - if isinstance(value, TensorTracer): - return True - if isinstance(value, (list, tuple)): - if any(isinstance(x, TensorTracer) for x in value): - return True - - @test_util.run_all_in_graph_and_eager_modes class DispatchTest(test_util.TensorFlowTestCase): @@ -172,21 +131,8 @@ class DispatchTest(test_util.TensorFlowTestCase): r".*some_op \(from __main__\) is deprecated and will be " "removed in a future version.*") - def testGlobalDispatcher(self): - original_global_dispatchers = dispatch._GLOBAL_DISPATCHERS - try: - TensorTracerOpDispatcher().register() - - x = TensorTracer("x") - y = TensorTracer("y") - trace = math_ops.reduce_sum(math_ops.add(math_ops.abs(x), y), axis=3) - self.assertEqual( - str(trace), "reduce_sum(add(name=None, x=abs(x), y=y), axis=3)") - - finally: - # Clean up. - dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers - if __name__ == "__main__": googletest.main() + + From 840e8b64a1a8ccbd88bf00621019912ec17c16a9 Mon Sep 17 00:00:00 2001 From: Kuangyuan Chen Date: Wed, 13 May 2020 10:20:30 -0700 Subject: [PATCH 115/412] Set up TFRT OSS dependency in Tensorflow. PiperOrigin-RevId: 311354250 Change-Id: I79f65da3dbde9ea21d412860fb63b417818268ee --- tensorflow/workspace.bzl | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7cc156a2985..6a958e1b00f 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -162,19 +162,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): print("path_prefix was specified to tf_workspace but is no longer used " + "and will be removed in the future.") - TFRT_COMMIT = "26fb26d716545388edb9785f8f4b3e60a4ad5092" - TFRT_SHA256 = "f7419a3eaab8b7137a4de5b428045a731d93da91ef1bce9ba91fab81ed23a676" - TFRT_URLS = [ - "http://mirror.tensorflow.org/github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), - "https://github.com/tensorflow/runtime/archive/{commit}.zip".format(commit = TFRT_COMMIT), - ] - tf_http_archive( - name = "tf_runtime", - sha256 = TFRT_SHA256, - strip_prefix = "runtime-" + TFRT_COMMIT, - urls = TFRT_URLS, - ) - tf_http_archive( name = "XNNPACK", sha256 = "15a300dec0d483af67310ed2edf76a6eff643e1438d0612ad00a372add472c22", From 88c4ee01021e67e7eaf49a32de381e751d484495 Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Wed, 13 May 2020 10:48:49 -0700 Subject: [PATCH 116/412] Adding TensorHandleList as the returned type for TF_ConcreteFunctionGetCaptures. 
PiperOrigin-RevId: 311360593 Change-Id: Ic292aef980339e5bd5e360eea391bbee4751caf9 --- .../c/experimental/saved_model/internal/BUILD | 37 +++++++++++++-- .../saved_model/internal/concrete_function.cc | 9 ++-- .../saved_model/internal/tensorhandle_list.cc | 39 +++++++++++++++ .../internal/tensorhandle_list_type.h | 37 +++++++++++++++ .../c/experimental/saved_model/public/BUILD | 7 +++ .../saved_model/public/c_saved_model_api.h | 1 + .../saved_model/public/concrete_function.h | 6 +-- .../public/concrete_function_list.h | 16 +++++-- .../saved_model/public/tensorhandle_list.h | 47 +++++++++++++++++++ 9 files changed, 184 insertions(+), 15 deletions(-) create mode 100644 tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc create mode 100644 tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h create mode 100644 tensorflow/c/experimental/saved_model/public/tensorhandle_list.h diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index 7a694f4f803..5c51e26f925 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -31,9 +31,6 @@ cc_library( "//tensorflow/c/experimental/saved_model/public:concrete_function.h", ], copts = tf_copts(), - # TODO(bmzhao): Remove this as we refactor C API to granular targets, - # so that we can depend on c/eager/c_api_unified_experimental.h. - features = ["-layering_check"], visibility = [ "//tensorflow/c/experimental/saved_model/public:__pkg__", ], @@ -41,6 +38,8 @@ cc_library( ":concrete_function_type", ":function_metadata", ":function_metadata_type", + ":tensorhandle_list", + ":tensorhandle_list_type", "//tensorflow/c:c_api_macros", "//tensorflow/c/eager:c_api", "//tensorflow/c/eager:c_api_internal", @@ -160,6 +159,38 @@ cc_library( ], ) +cc_library( + name = "tensorhandle_list", + srcs = [ + "tensorhandle_list.cc", + ], + hdrs = [ + "//tensorflow/c/experimental/saved_model/public:tensorhandle_list.h", + ], + copts = tf_copts(), + visibility = [ + "//tensorflow/c/experimental/saved_model/public:__pkg__", + ], + deps = [ + ":tensorhandle_list_type", + "//tensorflow/c:c_api_macros", + "//tensorflow/c/eager:c_api", + "//tensorflow/c/eager:tensor_handle_interface", + "//tensorflow/c/eager:tfe_tensorhandle_internal", + ], +) + +cc_library( + name = "tensorhandle_list_type", + hdrs = [ + "tensorhandle_list_type.h", + ], + deps = [ + "//tensorflow/c:conversion_macros", + "//tensorflow/c/eager:tensor_handle_interface", + ], +) + tf_cc_test( name = "saved_model_api_test", size = "small", diff --git a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc index 4884f9e2e97..dd54416ddf9 100644 --- a/tensorflow/c/experimental/saved_model/internal/concrete_function.cc +++ b/tensorflow/c/experimental/saved_model/internal/concrete_function.cc @@ -15,12 +15,12 @@ limitations under the License. 
#include "tensorflow/c/experimental/saved_model/public/concrete_function.h" -#include "tensorflow/c/eager/c_api_unified_experimental.h" #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/experimental/saved_model/core/concrete_function.h" #include "tensorflow/c/experimental/saved_model/core/function_metadata.h" #include "tensorflow/c/experimental/saved_model/internal/concrete_function_type.h" #include "tensorflow/c/experimental/saved_model/internal/function_metadata_type.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" extern "C" { @@ -29,10 +29,9 @@ TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata(TF_ConcreteFunction* func) { &tensorflow::unwrap(func)->GetFunctionMetadata())); } -TF_OutputList* TF_ConcreteFunctionGetCaptures(TF_ConcreteFunction* func) { - // TODO(bmzhao): Refactor TF_OutputList struct definition into a separate - // internal header, and implement this function. - return nullptr; +const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( + TF_ConcreteFunction* func) { + return tensorflow::wrap(&tensorflow::unwrap(func)->GetCaptures()); } TFE_Op* TF_ConcreteFunctionGetCallOp(TF_ConcreteFunction* func) { diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc new file mode 100644 index 00000000000..6ef937591aa --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -0,0 +1,39 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" + +#include + +#include "tensorflow/c/eager/tensor_handle_interface.h" +#include "tensorflow/c/eager/tfe_tensorhandle_internal.h" +#include "tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h" + +extern "C" { + +size_t TF_TensorHandleListSize(const TF_TensorHandleList* list) { + return tensorflow::unwrap(list)->size(); +} + +TFE_TensorHandle* TF_TensorHandleListGet(const TF_TensorHandleList* list, + int i) { + return tensorflow::wrap((*tensorflow::unwrap(list))[i]); +} + +void TF_DeleteTensorHandleList(const TF_TensorHandleList* list) { + delete tensorflow::unwrap(list); +} + +} // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h new file mode 100644 index 00000000000..8cbec2806a8 --- /dev/null +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list_type.h @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ + +#include + +#include "tensorflow/c/conversion_macros.h" +#include "tensorflow/c/eager/tensor_handle_interface.h" + +// Internal structures used by the SavedModel C API. These are likely to +// change and should not be depended on. + +typedef struct TF_TensorHandleList TF_TensorHandleList; + +namespace tensorflow { + +DEFINE_CONVERSION_FUNCTIONS( + std::vector, + TF_TensorHandleList) + +} // namespace tensorflow + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_INTERNAL_CONCRETE_FUNCTION_LIST_TYPE_H_ diff --git a/tensorflow/c/experimental/saved_model/public/BUILD b/tensorflow/c/experimental/saved_model/public/BUILD index af65e05e7f6..0cfa0a2c005 100644 --- a/tensorflow/c/experimental/saved_model/public/BUILD +++ b/tensorflow/c/experimental/saved_model/public/BUILD @@ -24,6 +24,7 @@ exports_files( "concrete_function_list.h", "function_metadata.h", "saved_model_api.h", + "tensorhandle_list.h", ], visibility = ["//tensorflow/c/experimental/saved_model/internal:__pkg__"], ) @@ -39,6 +40,7 @@ cc_library( ":concrete_function_list", ":function_metadata", ":saved_model_api", + ":tensorhandle_list", ], ) @@ -61,3 +63,8 @@ alias( name = "saved_model_api", actual = "//tensorflow/c/experimental/saved_model/internal:saved_model_api", ) + +alias( + name = "tensorhandle_list", + actual = "//tensorflow/c/experimental/saved_model/internal:tensorhandle_list", +) diff --git a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h index 30f533f140a..aae95a5477c 100644 --- a/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h +++ b/tensorflow/c/experimental/saved_model/public/c_saved_model_api.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/c/experimental/saved_model/public/concrete_function_list.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" #include "tensorflow/c/experimental/saved_model/public/saved_model_api.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" // IWYU pragma: end_exports #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_C_SAVED_MODEL_API_H_ diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function.h b/tensorflow/c/experimental/saved_model/public/concrete_function.h index 351d8daed8e..2a87214270c 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function.h @@ -17,9 +17,9 @@ limitations under the License. 
#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_H_ #include "tensorflow/c/c_api_macros.h" -#include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/eager/c_api_unified_experimental.h" +#include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/experimental/saved_model/public/function_metadata.h" +#include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h" #ifdef __cplusplus extern "C" { @@ -36,7 +36,7 @@ TF_CAPI_EXPORT extern TF_FunctionMetadata* TF_ConcreteFunctionGetMetadata( TF_ConcreteFunction* func); // Returns a list of TensorHandles implicitly captured by this function. -TF_CAPI_EXPORT extern TF_OutputList* TF_ConcreteFunctionGetCaptures( +TF_CAPI_EXPORT extern const TF_TensorHandleList* TF_ConcreteFunctionGetCaptures( TF_ConcreteFunction* func); // Returns a TFE_Op suitable for executing this function. diff --git a/tensorflow/c/experimental/saved_model/public/concrete_function_list.h b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h index 7add847259c..e35546751f1 100644 --- a/tensorflow/c/experimental/saved_model/public/concrete_function_list.h +++ b/tensorflow/c/experimental/saved_model/public/concrete_function_list.h @@ -21,19 +21,27 @@ limitations under the License. #include "tensorflow/c/c_api_macros.h" #include "tensorflow/c/experimental/saved_model/public/concrete_function.h" +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + // An opaque type that is acts like a list of TF_ConcreteFunction pointers. typedef struct TF_ConcreteFunctionList TF_ConcreteFunctionList; // Returns the size of `list`. -TF_CAPI_EXPORT size_t -TF_ConcreteFunctionListSize(TF_ConcreteFunctionList* list); +TF_CAPI_EXPORT extern size_t TF_ConcreteFunctionListSize( + TF_ConcreteFunctionList* list); // Returns the `i`th TF_ConcreteFunction in the list. -TF_CAPI_EXPORT TF_ConcreteFunction* TF_ConcreteFunctionListGet( +TF_CAPI_EXPORT extern TF_ConcreteFunction* TF_ConcreteFunctionListGet( TF_ConcreteFunctionList* list, int i); // Deletes `list`. -TF_CAPI_EXPORT void TF_DeleteConcreteFunctionList( +TF_CAPI_EXPORT extern void TF_DeleteConcreteFunctionList( TF_ConcreteFunctionList* list); +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + #endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_CONCRETE_FUNCTION_LIST_H_ diff --git a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h new file mode 100644 index 00000000000..393708aa2bf --- /dev/null +++ b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h @@ -0,0 +1,47 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ +#define TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ + +#include + +#include "tensorflow/c/c_api_macros.h" +#include "tensorflow/c/eager/c_api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +// An opaque type that is acts like a list of TF_ConcreteFunction pointers. +typedef struct TF_TensorHandleList TF_TensorHandleList; + +// Returns the size of `list`. +TF_CAPI_EXPORT extern size_t TF_TensorHandleListSize( + const TF_TensorHandleList* list); + +// Returns the `i`th TFE_TensorHandle in the list. +TF_CAPI_EXPORT extern TFE_TensorHandle* TF_TensorHandleListGet( + const TF_TensorHandleList* list, int i); + +// Deletes `list`. +TF_CAPI_EXPORT extern void TF_DeleteTensorHandleList( + const TF_TensorHandleList* list); + +#ifdef __cplusplus +} // end extern "C" +#endif // __cplusplus + +#endif // TENSORFLOW_C_EXPERIMENTAL_SAVED_MODEL_PUBLIC_TENSORHANDLE_LIST_H_ From b97bf5ae0be96a3e00aa12a096263c9de08f474c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 11:06:08 -0700 Subject: [PATCH 117/412] Flush denormals to zero in eager mode. PiperOrigin-RevId: 311364546 Change-Id: I42efa6b19b8193c49bc581879b04ce3d05a13607 --- .../common_runtime/eager/kernel_and_device.cc | 4 +++ .../python/kernel_tests/denormal_test.py | 33 +++++++++---------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc index 3c586e8188a..bf7c083f24b 100644 --- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc +++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc @@ -35,8 +35,10 @@ limitations under the License. #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/random/random.h" +#include "tensorflow/core/platform/denormal.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/fingerprint.h" +#include "tensorflow/core/platform/setround.h" #include "tensorflow/core/profiler/lib/annotated_traceme.h" #include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/public/version.h" @@ -281,6 +283,8 @@ Status KernelAndDeviceOp::Run( OpKernelContext context(¶ms); { + port::ScopedFlushDenormal flush; + port::ScopedSetRound round(FE_TONEAREST); // 'AnnotatedTraceMe' will trace both scheduling time on host and execution // time on device of the OpKernel. 
profiler::AnnotatedTraceMe activity( diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py index d824e95f213..6e073f0d526 100644 --- a/tensorflow/python/kernel_tests/denormal_test.py +++ b/tensorflow/python/kernel_tests/denormal_test.py @@ -23,7 +23,6 @@ import platform from tensorflow.python.framework import constant_op from tensorflow.python.framework import test_util -from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -35,32 +34,30 @@ class DenormalTest(test.TestCase): tiny = np.finfo(dtype).tiny self.assertEqual(tiny, tiny / 16 * 16) - def _flushDenormalsTest(self, use_gpu, dtypes): - if platform.machine() == "ppc64le" or platform.machine( - ) == "s390x" or platform.machine() == "aarch64": + def _flushDenormalsTest(self, dtypes): + if (platform.machine() == "ppc64le" or platform.machine() == "s390x" or + platform.machine() == "aarch64"): # Disabled denormal_test on power/s390x/aarch64 platform # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902 return - with self.cached_session(use_gpu=use_gpu): - array_ops.identity(7).eval() - for dtype in dtypes: - tiny = np.finfo(dtype).tiny - # Small shape to test main thread, large shape to test thread pool - for shape in (), (1 << 20,): - flush = 0.1 * constant_op.constant(tiny, shape=shape) - self.assertAllEqual(flush.eval(), np.zeros(shape)) - # Make sure the flags don't leak out - self.testPythonHasDenormals() + for dtype in dtypes: + tiny = np.finfo(dtype).tiny + # Small shape to test main thread, large shape to test thread pool + for shape in (), (1 << 20,): + flush = 0.1 * constant_op.constant(tiny, shape=shape) + self.assertAllEqual(self.evaluate(flush), np.zeros(shape)) + # Make sure the flags don't leak out + self.testPythonHasDenormals() - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=False) def testFlushDenormalsCPU(self): # On CPUs, the processor flags flush for both single and double precision. - self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64)) + self._flushDenormalsTest(dtypes=(np.float32, np.float64)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testFlushDenormalsGPU(self): # On GPUs, only single precision can flush to zero. - self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,)) + self._flushDenormalsTest(dtypes=(np.float32,)) if __name__ == "__main__": From 59239ab4990468323df7c8237713bbae7a77b548 Mon Sep 17 00:00:00 2001 From: Andrew Audibert Date: Wed, 13 May 2020 11:18:28 -0700 Subject: [PATCH 118/412] Use round-robin approach to reading from tf.data service workers. 
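Before this change the iterator started one dedicated thread per task; now a pool of worker threads, capped at max_outstanding_requests_, shares a single task list and takes turns reading from the tf.data service workers. Each thread scans the list starting at next_task_index_, claims the first task that is neither in use nor finished, and advances the index so the next thread starts at the following task. The standalone sketch below shows only that selection step; Task is a simplified stand-in for the struct added further down, and PickNextTask is a hypothetical helper used purely for illustration (the real logic lives inline in RunWorkerThread and runs under the iterator mutex).

#include <cstdint>
#include <vector>

// Simplified stand-in for the per-task bookkeeping added by this change;
// the real struct also owns the worker client and is guarded by a mutex.
struct Task {
  int64_t task_id = 0;
  bool in_use = false;
  bool end_of_sequence = false;
};

// Round-robin selection: scan the task list starting at next_task_index,
// claim the first task that is free and not exhausted, and advance the
// index so the next caller starts at the following task. Returns nullptr
// if every task is either busy or finished.
Task* PickNextTask(std::vector<Task>& tasks, int64_t& next_task_index) {
  const int64_t num_tasks = static_cast<int64_t>(tasks.size());
  for (int64_t i = 0; i < num_tasks; ++i) {
    const int64_t index = (next_task_index + i) % num_tasks;
    Task& task = tasks[index];
    if (!task.in_use && !task.end_of_sequence) {
      task.in_use = true;
      next_task_index = (index + 1) % num_tasks;
      return &task;
    }
  }
  return nullptr;
}

int main() {
  std::vector<Task> tasks = {{0}, {1}, {2}};
  int64_t next_task_index = 0;
  Task* a = PickNextTask(tasks, next_task_index);  // claims task 0
  Task* b = PickNextTask(tasks, next_task_index);  // starts at task 1 and claims it
  a->in_use = false;                               // task 0 is released
  Task* c = PickNextTask(tasks, next_task_index);  // claims task 2 before returning to task 0
  return (a && b && c) ? 0 : 1;
}

Claiming a task (in_use = true) before releasing the lock is what keeps two worker threads from issuing overlapping requests against the same task.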
PiperOrigin-RevId: 311367134 Change-Id: I5408de5d85c13514c55681ecf09dcecec5c2168a --- .../experimental/data_service_dataset_op.cc | 264 +++++++++++------- 1 file changed, 167 insertions(+), 97 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc index 697f4d99a1e..56077a671fb 100644 --- a/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc +++ b/tensorflow/core/kernels/data/experimental/data_service_dataset_op.cc @@ -189,7 +189,9 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { VLOG(1) << "Destroying data service dataset iterator for job id " << job_id_; cancelled_ = true; - cv_.notify_all(); + worker_thread_cv_.notify_all(); + manager_thread_cv_.notify_all(); + get_next_cv_.notify_all(); // Thread destructors will block until the threads finish, no need to wait // here. } @@ -222,12 +224,16 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { }); } - while (results_.empty() && !job_finished_ && !cancelled_) { - cv_.wait(l); + while (results_.empty() && !job_finished_ && !cancelled_ && + status_.ok()) { + get_next_cv_.wait(l); } if (cancelled_) { return errors::Cancelled("Data service iterator was cancelled"); } + if (!status_.ok()) { + return status_; + } if (results_.empty()) { *end_of_sequence = true; return Status::OK(); @@ -236,7 +242,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { *end_of_sequence = false; out_tensors->swap(results_.front()); results_.pop(); - cv_.notify_all(); + worker_thread_cv_.notify_one(); return Status::OK(); } @@ -259,16 +265,21 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { } private: - typedef struct TaskThread { - int64 task_id; - // Cached address of the worker for task `task_id`. - std::string address; - std::unique_ptr worker; - std::unique_ptr thread; - bool end_of_sequence = false; - // Indicates that the thread has finished running. - bool finished = false; - } TaskThread; + struct Task { + Task(int64 task_id, const std::string& address, + std::unique_ptr worker) + : task_id(task_id), address(address), worker(std::move(worker)) {} + + const int64 task_id; + // Address of the tf.data service worker for task `task_id`. + const std::string address; + // Client for fetching task elements from the tf.data service worker. + const std::unique_ptr worker; + // Indicates whether a worker thread is currently processing the task. + bool in_use TF_GUARDED_BY(&Iterator::mu_) = false; + // Indicates whether the worker has returned end_of_sequence for the task. + bool end_of_sequence TF_GUARDED_BY(&Iterator::mu_) = false; + }; // Periodically refresh the task list. // Maintain one thread fetching elements for each task. 
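The single cv_ is also split into three condition variables: get_next_cv_ wakes the consumer blocked in GetNext, worker_thread_cv_ wakes worker threads waiting for buffer space or a free task, and manager_thread_cv_ wakes the task thread manager between refreshes. Separate variables let the code use notify_one without the risk that the one notification is consumed by a waiter whose condition did not change, which is why the old code fell back to notify_all. A minimal, generic bounded-buffer sketch of the same pattern follows; it is illustrative only, and none of these names appear in the patch.

#include <condition_variable>
#include <cstddef>
#include <mutex>
#include <queue>

// Bounded buffer with one condition variable per waiting role, mirroring
// the get_next_cv_ / worker_thread_cv_ split: consumers wake only when an
// element arrives, producers wake only when space frees up, so notify_one
// never wakes the wrong kind of waiter.
class BoundedBuffer {
 public:
  explicit BoundedBuffer(size_t capacity) : capacity_(capacity) {}

  void Push(int element) {
    std::unique_lock<std::mutex> l(mu_);
    producer_cv_.wait(l, [&] { return queue_.size() < capacity_; });
    queue_.push(element);
    consumer_cv_.notify_one();  // Wake exactly one waiting consumer.
  }

  int Pop() {
    std::unique_lock<std::mutex> l(mu_);
    consumer_cv_.wait(l, [&] { return !queue_.empty(); });
    int element = queue_.front();
    queue_.pop();
    producer_cv_.notify_one();  // Wake exactly one waiting producer.
    return element;
  }

 private:
  const size_t capacity_;
  std::mutex mu_;
  std::condition_variable producer_cv_;
  std::condition_variable consumer_cv_;
  std::queue<int> queue_;
};

In the actual iterator the consumer side additionally wakes on cancellation, job completion, and a stored error status, but the buffering discipline between GetNext and the worker threads is the same.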
@@ -286,22 +297,23 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { int64 remaining_time = next_check - Env::Default()->NowMicros(); VLOG(3) << "Task thread manager waiting for " << remaining_time << "us"; - cv_.wait_for(l, std::chrono::microseconds(remaining_time)); + manager_thread_cv_.wait_for( + l, std::chrono::microseconds(remaining_time)); } if (cancelled_) { VLOG(3) << "Task thread manager finished"; return; } } - UpdateTaskThreads(&master, ctx.get()); + UpdateTasks(&master); + UpdateWorkerThreads(ctx.get()); next_check = Env::Default()->NowMicros() + dataset()->task_refresh_interval_ms_ * 1000; } } - void UpdateTaskThreads(DataServiceMasterClient* master, - IteratorContext* ctx) LOCKS_EXCLUDED(mu_) { - VLOG(3) << "Updating task threads"; + void UpdateTasks(DataServiceMasterClient* master) LOCKS_EXCLUDED(mu_) { + VLOG(3) << "Updating tasks"; std::vector tasks; bool job_finished; Status s = master->GetTasks(job_id_, &tasks, &job_finished); @@ -310,94 +322,119 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { << s; return; } - absl::flat_hash_set task_ids; + absl::flat_hash_map task_id_to_task; + for (auto& task : tasks) { + task_id_to_task[task.id()] = task; + } mutex_lock l(mu_); job_finished_ = job_finished; - for (auto& task : tasks) { - task_ids.insert(task.id()); - if (task_threads_.contains(task.id())) { - continue; - } - task_threads_[task.id()] = absl::make_unique(); - TaskThread* task_thread = task_threads_[task.id()].get(); - task_thread->task_id = task.id(); - task_thread->address = task.worker_address(); - num_unfinished_tasks_++; - outstanding_requests_++; - auto done = [this, task_thread]() { - mutex_lock l(mu_); - num_unfinished_tasks_--; - outstanding_requests_--; - cv_.notify_all(); - task_thread->finished = true; - VLOG(3) << "Task thread " << task_thread->task_id << " finished"; - }; - task_thread->thread = - ctx->StartThread("tf-data-service-task_thread", - [this, task_thread, done = std::move(done)]() { - RunTaskThread(task_thread, std::move(done)); - }); + if (job_finished) { + get_next_cv_.notify_all(); + return; } - // Mark deleted tasks and clean up finished task threads. - for (auto it = task_threads_.begin(); it != task_threads_.end();) { - TaskThread* task_thread = it->second.get(); - if (task_thread->finished) { - task_threads_.erase(it++); + for (int i = 0; i < tasks_.size(); ++i) { + std::shared_ptr task = tasks_[i]; + if (task_id_to_task.contains(task->task_id)) { + // Remove already-known tasks from `task_id_to_task`, so that at the + // end of the loop, only new tasks remain. + task_id_to_task.erase(task->task_id); + } else { + // Task has been removed. + if (task->end_of_sequence) { + finished_tasks_--; + } + tasks_[i] = tasks_[tasks_.size() - 1]; + tasks_.pop_back(); + } + } + for (auto& new_task_entry : task_id_to_task) { + TaskInfo& task_info = new_task_entry.second; + std::unique_ptr worker; + Status s = CreateDataServiceWorkerClient(task_info.worker_address(), + dataset()->protocol_, &worker); + if (!s.ok()) { + status_ = s; + get_next_cv_.notify_all(); continue; } - if (!task_ids.contains(task_thread->task_id)) { - VLOG(3) << "Marking removed task thread " << task_thread->task_id - << " as finished"; - task_thread->end_of_sequence = true; - } - ++it; + tasks_.push_back(std::make_shared( + task_info.id(), task_info.worker_address(), std::move(worker))); } if (dataset()->max_outstanding_requests_ == model::kAutotune) { // Adjust max_outstanding_requests to account for newly added tasks. 
- max_outstanding_requests_ = task_threads_.size(); + max_outstanding_requests_ = tasks_.size(); } } - void RunTaskThread(TaskThread* task_thread, std::function done) { + void UpdateWorkerThreads(IteratorContext* ctx) LOCKS_EXCLUDED(mu_) { + mutex_lock l(mu_); + while (num_running_worker_threads_ < max_outstanding_requests_) { + num_running_worker_threads_++; + outstanding_requests_++; + auto done = [this]() { + mutex_lock l(mu_); + num_running_worker_threads_--; + outstanding_requests_--; + VLOG(3) << "Exiting worker thread"; + }; + worker_threads_.push_back(ctx->StartThread( + "tf-data-service-task_thread", [this, done = std::move(done)]() { + RunWorkerThread(std::move(done)); + })); + } + } + + void RunWorkerThread(std::function done) { auto cleanup = gtl::MakeCleanup([done = std::move(done)]() { done(); }); - VLOG(3) << "Starting task thread for task " << task_thread->task_id - << " with worker address " << task_thread->address; + VLOG(3) << "Starting worker thread"; + std::shared_ptr task_to_process; while (true) { - if (!task_thread->worker) { - Status s = CreateDataServiceWorkerClient( - task_thread->address, dataset()->protocol_, &task_thread->worker); - if (!s.ok()) { - LOG(WARNING) << "Failed to create a worker client for " - << task_thread->address << ": " << s; - } - } { mutex_lock l(mu_); - if (task_thread->end_of_sequence) { - VLOG(3) << "Task thread" << task_thread->task_id - << " reached end_of_sequence"; - return; + if (task_to_process) { + task_to_process->in_use = false; + task_to_process = nullptr; + worker_thread_cv_.notify_one(); } outstanding_requests_--; - while (!cancelled_ && results_.size() + outstanding_requests_ >= - max_outstanding_requests_) { - VLOG(3) << "Task thread for task " << task_thread->task_id - << " waiting. results_.size()=" << results_.size() - << " outstanding_requests_=" << outstanding_requests_; - cv_.wait(l); + while (!cancelled_ && !(SpaceInBuffer() && TaskAvailable())) { + if (VLOG_IS_ON(3)) { + VLOG(3) << "Sleeping with results_.size=" << results_.size() + << ", outstanding_requests_=" << outstanding_requests_ + << ", max_oustanding_requests=" + << max_outstanding_requests_ + << " finished_tasks=" << finished_tasks_ + << " tasks_.size()=" << tasks_.size(); + } + worker_thread_cv_.wait(l); } - outstanding_requests_++; if (cancelled_) { return; } + outstanding_requests_++; + // Search for a task to update. + int num_tasks = tasks_.size(); + for (int i = 0; i < num_tasks; ++i) { + int index = (next_task_index_ + i) % num_tasks; + std::shared_ptr& task = tasks_[index]; + if (!task->in_use && !task->end_of_sequence) { + task->in_use = true; + task_to_process = task; + next_task_index_ = (index + 1) % num_tasks; + break; + } + } + DCHECK(task_to_process != nullptr); + VLOG(3) << "Processing task " << task_to_process->task_id; } - // TODO(aaudibert): add backoff and max retries. int64 deadline_micros = Env::Default()->NowMicros() + kRetryTimeoutMicros; - Status s = GetElement(task_thread, deadline_micros); + Status s = GetElement(task_to_process.get(), deadline_micros); if (!s.ok()) { - LOG(WARNING) << "Failed to get element from worker at " - << task_thread->address << ": " << s; + mutex_lock l(mu_); + status_ = s; + get_next_cv_.notify_all(); + return; } } } @@ -407,18 +444,27 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { // If the task reaches end_of_sequence or is cancelled (e.g. due to a // worker dying), GetElement returns Status::OK() without adding to // `results_`. 
- Status GetElement(TaskThread* task_thread, int64 deadline_micros) { - VLOG(3) << "Getting an element for task id " << task_thread->task_id; + Status GetElement(Task* task, int64 deadline_micros) + TF_LOCKS_EXCLUDED(mu_) { + VLOG(3) << "Getting an element for task id " << task->task_id; tensorflow::profiler::TraceMe activity( "GetElement", tensorflow::profiler::TraceMeLevel::kInfo); CompressedElement compressed; bool end_of_sequence; for (int num_retries = 0;; ++num_retries) { - Status s = task_thread->worker->GetElement( - task_thread->task_id, &compressed, &end_of_sequence); + Status s = task->worker->GetElement(task->task_id, &compressed, + &end_of_sequence); if (s.ok()) { break; } + if (errors::IsNotFound(s)) { + // This indicates that the worker was restarted. The restarted worker + // will get a new task, and the old task is lost. + mutex_lock l(mu_); + finished_tasks_++; + task->end_of_sequence = true; + return Status::OK(); + } // Retry all errors that could indicate preemption. if (!errors::IsUnavailable(s) && !errors::IsCancelled(s) && !errors::IsAborted(s)) { @@ -428,7 +474,7 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { mutex_lock l(mu_); // If `UpdateTaskThreads` finds that the task has been cancelled, it // will set end_of_sequence to `true`. - if (task_thread->end_of_sequence || cancelled_) { + if (task->end_of_sequence || cancelled_) { return Status::OK(); } } @@ -454,21 +500,31 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { } mutex_lock l(mu_); if (end_of_sequence) { - task_thread->end_of_sequence = true; + task->end_of_sequence = true; + finished_tasks_++; return Status::OK(); } results_.push(std::move(element)); - cv_.notify_all(); - VLOG(3) << "Got an element for task id " << task_thread->task_id; + get_next_cv_.notify_all(); + VLOG(3) << "Got an element for task id " << task->task_id; return Status::OK(); } + bool SpaceInBuffer() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return results_.size() + outstanding_requests_ < + max_outstanding_requests_; + } + + bool TaskAvailable() TF_EXCLUSIVE_LOCKS_REQUIRED(mu_) { + return finished_tasks_ + outstanding_requests_ < tasks_.size(); + } + const int64 iterator_index_; mutex mu_; - // TODO(aaudibert): split this into a couple cvs for different conditions - // so that we can use notify_one and avoid unnecessary wakeups. - condition_variable cv_ TF_GUARDED_BY(mu_); + condition_variable get_next_cv_ TF_GUARDED_BY(mu_); + condition_variable worker_thread_cv_ TF_GUARDED_BY(mu_); + condition_variable manager_thread_cv_ TF_GUARDED_BY(mu_); bool cancelled_ TF_GUARDED_BY(mu_) = false; int64 outstanding_requests_ TF_GUARDED_BY(mu_) = 0; @@ -476,17 +532,31 @@ class DataServiceDatasetOp::Dataset : public DatasetBase { // at the same time. This count includes both in-progress requests for // elements as well as completed requests which haven't yet been produced. int64 max_outstanding_requests_ TF_GUARDED_BY(mu_); + + // The number of threads in `worker_threads_` which are still running. + int64 num_running_worker_threads_ TF_GUARDED_BY(mu_) = 0; + + // The index of the next task in `tasks_` to read from. + int64 next_task_index_ TF_GUARDED_BY(mu_) = 0; + + // The number tasks in the `tasks_` list that have reached end_of_sequence. + int64 finished_tasks_ TF_GUARDED_BY(mu_) = 0; + + // List of tasks to read from. + std::vector> tasks_ TF_GUARDED_BY(mu_); + + // A status to be returned from the next call to `GetNext`. This is set by + // asynchronous threads when they encounter errors. 
+ Status status_ TF_GUARDED_BY(mu_) = Status::OK(); std::queue> results_ TF_GUARDED_BY(mu_); // Set once in Initialize(). int64 job_id_; - int64 num_unfinished_tasks_ TF_GUARDED_BY(mu_) = 0; bool job_finished_ = false; - // Must come second to last so that task threads are joined before + // Must be ordered second to last so that worker threads are joined before // destroying other fields. - absl::flat_hash_map> task_threads_ - TF_GUARDED_BY(mu_); + std::vector> worker_threads_ TF_GUARDED_BY(mu_); // Must be ordered last so that the thread is joined before destroying other // fields. std::unique_ptr task_thread_manager_ GUARDED_BY(mu_); From d45abae4e938be8f6bac8b9a1e1344241a30e2a1 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 13 May 2020 11:20:11 -0700 Subject: [PATCH 119/412] [XLA] Move SPMD partitioner to third_party This change moves the work on SPMD partitioning that the XLA team has been working on in the past 12 months. PiperOrigin-RevId: 311367525 Change-Id: If174527128c222c53736dc8db2ef1ea4177fb476 --- tensorflow/compiler/xla/service/BUILD | 31 + .../compiler/xla/service/hlo_sharding_util.cc | 574 ++ .../compiler/xla/service/hlo_sharding_util.h | 143 + .../xla/service/hlo_sharding_util_test.cc | 206 + tensorflow/compiler/xla/service/spmd/BUILD | 69 + .../xla/service/spmd/spmd_partitioner.cc | 4655 +++++++++++++++++ .../xla/service/spmd/spmd_partitioner.h | 435 ++ .../xla/service/spmd/spmd_partitioner_test.cc | 3191 +++++++++++ .../xla/service/spmd/spmd_partitioner_util.cc | 662 +++ .../xla/service/spmd/spmd_partitioner_util.h | 229 + 10 files changed, 10195 insertions(+) create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_util.cc create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_util.h create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_util_test.cc create mode 100644 tensorflow/compiler/xla/service/spmd/BUILD create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner.h create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc create mode 100644 tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index 3349528ebc2..126b62a8eb2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -460,6 +460,37 @@ cc_library( ], ) +cc_library( + name = "hlo_sharding_util", + srcs = [ + "hlo_sharding_util.cc", + ], + hdrs = [ + "hlo_sharding_util.h", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:array", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "hlo_sharding_util_test", + srcs = [ + "hlo_sharding_util_test.cc", + ], + deps = [ + ":hlo_sharding_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + tf_cc_test( name = "dynamic_parameter_binding_test", srcs = ["dynamic_parameter_binding_test.cc"], diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.cc b/tensorflow/compiler/xla/service/hlo_sharding_util.cc new file mode 100644 index 00000000000..129091ca06f --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.cc @@ -0,0 
+1,574 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include + +#include "absl/algorithm/container.h" +#include "tensorflow/compiler/xla/array.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace hlo_sharding_util { + +absl::optional SelectDominantDevice( + const std::map& device_map, int64* top_count) { + int64 device = 0; + int64 count = 0; + for (auto& it : device_map) { + if (it.second > count) { + count = it.second; + device = it.first; + } + } + if (top_count != nullptr) { + *top_count = count; + } + return count > 0 ? absl::optional(device) : absl::optional(); +} + +Status AssignComputationDevice(HloComputation* computation, int64 device) { + VLOG(4) << "Assigning device " << device << " to " << computation->name() + << " computation"; + for (HloInstruction* instruction : computation->instructions()) { + if (!instruction->has_sharding()) { + VLOG(4) << "Assigning device " << device << " to " << instruction->name(); + instruction->set_device_sharding(device); + } + } + return Status::OK(); +} + +absl::optional GetMostOccurringDevice( + absl::Span instructions) { + std::map device_map; + for (HloInstruction* instruction : instructions) { + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(nullptr)) { + // The UsedDevices() API returns a map. + device_map[it.first] += it.second; + } + } + } + return SelectDominantDevice(device_map, nullptr); +} + +StatusOr> GetDominantDevice( + absl::Span computations, double dominant_factor) { + int64 instruction_count = 0; + std::map device_map; + for (HloComputation* computation : computations) { + for (HloInstruction* instruction : computation->instructions()) { + int64 count = 1; + if (instruction->has_sharding()) { + for (auto& it : instruction->sharding().UsedDevices(&count)) { + // The UsedDevices() API returns a map. 
+ device_map[it.first] += it.second; + } + } + instruction_count += count; + } + } + int64 count; + absl::optional device = SelectDominantDevice(device_map, &count); + absl::optional dominant_device; + if (device) { + double factor = + static_cast(count) / static_cast(instruction_count); + if (factor >= dominant_factor) { + dominant_device = device; + } + } + return dominant_device; +} + +HloSharding TransposeSharding(const HloSharding& sharding, + const std::vector& dimensions) { + if (sharding.IsTileMaximal()) { + return sharding; + } + const int64 rank = dimensions.size(); + std::vector tile_assignment_dim(rank); + for (int64 i = 0; i < rank; ++i) { + tile_assignment_dim[i] = sharding.tile_assignment().dim(dimensions[i]); + } + Array tile_assignment = sharding.tile_assignment(); + tile_assignment.Reshape(tile_assignment_dim); + tile_assignment.Each([&](absl::Span indices, int64* value) { + std::vector src_indices(indices.size(), -1); + for (int64 i = 0; i < indices.size(); ++i) { + src_indices[dimensions[i]] = indices[i]; + } + *value = sharding.tile_assignment()(src_indices); + }); + return HloSharding::Tile(tile_assignment); +} + +absl::optional ReshapeSharding(const Shape& source_shape, + const Shape& target_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return sharding; + } + + // In case of a tiled sharding the reshaped sharding will be a valid if the + // reshape is composed from the following operations: + // * Adding or removing dimensions with size 1. + // * Merging consecutive dimensions where only the most major is sharded. + // * Splitting a dimension to consecutive dimensions. + // * Any reshaping of unsharded dimensions. + // Note that merge and split can happen consecutively on the same dimension, + // e.g., f32[1024,256,1024] to f32[128,2048,1024] can be considered that 1024 + // gets split into 128 and 8, but 8 then gets merged with 256. We use stacks + // to make supporting such cases easy. + const Shape tile_shape = sharding.TileShape(source_shape); + std::vector target_tile_assignment_dimensions; + std::vector source_dims_stack(source_shape.rank()); + std::vector target_dims_stack(target_shape.rank()); + std::vector sharding_tile_dims_stack(source_shape.rank()); + for (int64 i = 0; i < source_shape.rank(); ++i) { + source_dims_stack[i] = source_shape.dimensions(source_shape.rank() - 1 - i); + sharding_tile_dims_stack[i] = + sharding.tile_assignment().dim(source_shape.rank() - 1 - i); + } + for (int64 i = 0; i < target_shape.rank(); ++i) { + target_dims_stack[i] = target_shape.dimensions(target_shape.rank() - 1 - i); + } + while (!source_dims_stack.empty() || !target_dims_stack.empty()) { + if (target_dims_stack.empty()) { + if (Product(sharding_tile_dims_stack) != 1) { + return absl::nullopt; + } + break; + } + int64 s_size = 1; + int64 t_size = 1; + int64 s_partitions = 1; + if (!source_dims_stack.empty()) { + s_size = source_dims_stack.back(); + source_dims_stack.pop_back(); + s_partitions = sharding_tile_dims_stack.back(); + sharding_tile_dims_stack.pop_back(); + } + t_size = target_dims_stack.back(); + target_dims_stack.pop_back(); + if (s_partitions * Product(sharding_tile_dims_stack) == 1) { + // No more partitions left. + target_tile_assignment_dimensions.push_back(1); + continue; + } + if (s_size == t_size) { + // Same dimension. + target_tile_assignment_dimensions.push_back(s_partitions); + } else if (t_size == 1) { + // Trivial dimension added. 
+ target_tile_assignment_dimensions.push_back(1); + source_dims_stack.push_back(s_size); + sharding_tile_dims_stack.push_back(s_partitions); + } else if (s_size == 1) { + // Trivial dimension removed. + if (s_partitions != 1) { + return absl::nullopt; + } + target_dims_stack.push_back(t_size); + } else if (s_size > t_size) { + // Dimension split. + if (s_size % t_size != 0 || t_size % s_partitions != 0) { + return absl::nullopt; + } + target_tile_assignment_dimensions.push_back(s_partitions); + // We have part of the s_size unprocessed, so put it back to stack. + source_dims_stack.push_back(s_size / t_size); + sharding_tile_dims_stack.push_back(1); + } else { + // Dimension merge. Also merge the source dimension with the next, and + // process it next time. + if (s_size % s_partitions != 0) { + return absl::nullopt; + } + CHECK(!source_dims_stack.empty()); + if (sharding_tile_dims_stack.back() != 1 && s_size != s_partitions) { + // If the next dimension to combine is sharded, we require that the + // current dimension's shard size to be 1. Otherwise, the new shard + // would be non-contiguous. + return absl::nullopt; + } + source_dims_stack.back() *= s_size; + sharding_tile_dims_stack.back() *= s_partitions; + target_dims_stack.push_back(t_size); + } + } + Array new_tile_assignment = sharding.tile_assignment(); + new_tile_assignment.Reshape(target_tile_assignment_dimensions); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, + absl::Span dims) { + CHECK(!sharding.IsTuple() && !sharding.IsTileMaximal()); + CHECK_NE(absl::c_find(dims, dim), dims.end()) << "dim is not in dims"; + // We optimize the tile assignment on the single dimension dim in a way to + // minimize communication among devices caused by the reshard: + // +---+---+ +---+---+ +-+-+-+-+ + // | | | | 0 | | | | | | + // | 0 | 1 | +-------+ | | | | | + // | | | reshape on | 1 | reshape on | | | | | + // +---+---+ dim 0 => +-------+ dim 1 => |0|2|1|3| + // | | | | 2 | | | | | | + // | 2 | 3 | +-------+ | | | | | + // | | | | 3 | | | | | | + // +---+---+ +---+---+ +-+-+-+-+ + + std::vector tile_dims(sharding.tile_assignment().num_dimensions(), 1); + // Handle ignore dimensions. 
+ std::vector ignore_sizes; + int64 ignore_size = 1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + int64 size = sharding.tile_assignment().dim(i); + ignore_sizes.push_back(size); + tile_dims[i] = size; + ignore_size *= size; + } + } + + using Buckets = std::vector>; + Array buckets(ignore_sizes, + Buckets(sharding.tile_assignment().dim(dim))); + sharding.tile_assignment().Each( + [&](absl::Span index, int64 device) { + std::vector ignore_index; + for (int64 i = 0; i < index.size(); ++i) { + if (absl::c_find(dims, i) == dims.end()) { + ignore_index.push_back(index[i]); + } + } + buckets(ignore_index)[index[dim]].push_back(device); + }); + std::vector devices; + buckets.Each([&](absl::Span index, const Buckets& buckets) { + for (auto& bucket : buckets) { + devices.insert(devices.end(), bucket.begin(), bucket.end()); + } + }); + tile_dims[dim] = devices.size() / ignore_size; + Array tile_assignment(tile_dims); + tile_assignment.SetValues(devices); + return HloSharding::Tile(tile_assignment); +} + +bool ContainsTileSharding(const HloModule& module) { + for (const HloComputation* computation : module.computations()) { + for (const HloInstruction* instruction : computation->instructions()) { + if (instruction->has_sharding() && + !instruction->sharding().IsTileMaximal()) { + return true; + } + } + } + return false; +} + +HloSharding GatherOutputSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector output_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.offset_dims(), i)) { + output_tile_assignment_dims.push_back(1); + } else { + output_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(output_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo) { + if (output_sharding.IsTileMaximal()) { + return output_sharding; + } + + const GatherDimensionNumbers& dnums = hlo->gather_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + index_tile_assignment_dims.push_back( + output_sharding.tile_assignment().dim(i)); + } + } + Array new_tile_assignment = output_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo) { + if (hlo.sharding().IsTileMaximal()) { + return hlo.sharding(); + } + + const GatherDimensionNumbers& dnums = hlo.gather_dimension_numbers(); + std::vector tile_assignment_dims(hlo.shape().rank()); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + tile_assignment_dims[i] = hlo.sharding().tile_assignment().dim(i); + num_elements *= hlo.sharding().tile_assignment().dim(i); + } else { + tile_assignment_dims[i] = 1; + } + } + if (num_elements == hlo.sharding().tile_assignment().num_elements()) { + // Output sharding is only on non offset dimensions. 
We use output sharding + // to shard this gather op directly. + return hlo.sharding(); + } + + if (num_elements == 1) { + // Output sharding is only on offset dimensions. We do not shard this gather + // op. Return a tile maximal sharding with the first device in output + // sharding tile assignment. + return HloSharding::AssignDevice(*hlo.sharding().tile_assignment().begin()); + } + + // Output sharding is on both offset and non offset dimensions. We shard the + // gather op only on non offset dimensions. + // For example: + // - the gather op has sharding [2,2]{0,1,2,3}, + // - first dimension is non offset dimension, + // - second dimension is offset dimension, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(hlo.shape().rank(), 0LL), + slice_limits(hlo.shape().rank()); + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.offset_dims(), i)) { + slice_limits[i] = hlo.sharding().tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + hlo.sharding().tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector index_tile_assignment_dims; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_binary_search(dnums.update_window_dims(), i)) { + index_tile_assignment_dims.push_back( + data_sharding.tile_assignment().dim(i)); + } + } + if (index_tile_assignment_dims.size() < hlo->operand(1)->shape().rank()) { + index_tile_assignment_dims.push_back(1); + } + Array new_tile_assignment = data_sharding.tile_assignment(); + new_tile_assignment.Reshape(index_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo->scatter_dimension_numbers(); + std::vector data_tile_assignment_dims; + for (int64 i = 0, index_dim = 0; i < hlo->shape().rank(); ++i) { + if (absl::c_binary_search(dnums.update_window_dims(), i)) { + data_tile_assignment_dims.push_back(1); + } else { + data_tile_assignment_dims.push_back( + index_sharding.tile_assignment().dim(index_dim)); + index_dim++; + } + } + Array new_tile_assignment = index_sharding.tile_assignment(); + new_tile_assignment.Reshape(data_tile_assignment_dims); + return HloSharding::Tile(new_tile_assignment); +} + +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo) { + if (index_sharding.IsTileMaximal()) { + return index_sharding; + } + + // Only shard on first "number of scatter_window_dims" dimensions. + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + int64 num_elements = 1; + int64 index_dim = 0; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + num_elements *= index_sharding.tile_assignment().dim(index_dim); + index_dim++; + } + } + if (num_elements == index_sharding.tile_assignment().num_elements()) { + // Index sharding is only on scatter_window_dims. We use this index sharding + // directly. + return index_sharding; + } + + // Index sharding is only on update_window_dims. 
We do not shard this scatter + // op. Return a tile maximal sharding with the first device in index sharding + // tile assignment. + if (num_elements == 1) { + return HloSharding::AssignDevice(*index_sharding.tile_assignment().begin()); + } + + const int64 index_rank = hlo.operand(1)->shape().rank(); + std::vector slice_starts(index_rank, 0LL), slice_limits(index_rank); + for (int64 i = 0; i < index_rank; ++i) { + if (i < index_dim) { + slice_limits[i] = index_sharding.tile_assignment().dim(i); + } else { + slice_limits[i] = 1; + } + } + Array tile_assignment = + index_sharding.tile_assignment().Slice(slice_starts, slice_limits); + return HloSharding::Tile(tile_assignment); +} + +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo) { + if (data_sharding.IsTileMaximal()) { + return data_sharding; + } + + const ScatterDimensionNumbers& dnums = hlo.scatter_dimension_numbers(); + const int64 data_rank = hlo.operand(2)->shape().rank(); + std::vector tile_assignment_dims(data_rank, 1LL); + int64 num_elements = 1; + for (int64 i = 0; i < hlo.shape().rank(); ++i) { + if (absl::c_binary_search(dnums.inserted_window_dims(), i)) { + CHECK_LT(i, data_rank); + tile_assignment_dims[i] = data_sharding.tile_assignment().dim(i); + num_elements *= data_sharding.tile_assignment().dim(i); + } + } + if (num_elements == data_sharding.tile_assignment().num_elements()) { + // Data sharding is only on scatter_window_dims. We use this data sharding + // directly. + return data_sharding; + } + + if (num_elements == 1) { + // Data sharding is only on update_window_dims. We do not shard this + // scatter op. Return a tile maximal sharding with the first device in + // data sharding tile assignment. + return HloSharding::AssignDevice(*data_sharding.tile_assignment().begin()); + } + + // Data sharding is on both update_window_dims and scatter_window_dims. We + // shard the scatter op only on scatter_window_dims. For example: + // - the scatter data has sharding [2,2]{0,1,2,3}, + // - first dimension is scatter_window_dims, + // - second dimension is update_window_dims, + // Then the result sharding will be [2,1]{0,2}. + std::vector slice_starts(data_rank, 0LL); + Array tile_assignment = + data_sharding.tile_assignment().Slice(slice_starts, tile_assignment_dims); + return HloSharding::Tile(tile_assignment); +} + +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter) { + auto computation = scatter.to_apply(); + // We only handle computations with 2 parameters and only 1 calculation. 
+ if (computation->instruction_count() != 3) { + return Status( + tensorflow::error::Code::INVALID_ARGUMENT, + "Expected scatter reduce computation with 2 parameters and only 1 " + "calculation"); + } + + auto root_instruction = computation->root_instruction(); + if (root_instruction->opcode() == HloOpcode::kAdd || + root_instruction->opcode() == HloOpcode::kOr) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::Zero( + scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMultiply || + root_instruction->opcode() == HloOpcode::kAnd) { + return std::make_pair(HloInstruction::CreateConstant( + LiteralUtil::One(scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMaximum) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MinValue( + scatter.shape().element_type())), + root_instruction->opcode()); + } else if (root_instruction->opcode() == HloOpcode::kMinimum) { + return std::make_pair(HloInstruction::CreateConstant(LiteralUtil::MaxValue( + scatter.shape().element_type())), + root_instruction->opcode()); + } + + return Status(tensorflow::error::Code::INVALID_ARGUMENT, + "Expected scatter reduce computation which is " + "add/or/multiply/add/min/max"); +} + +std::vector DevicesForSharding( + const HloSharding& sharding, const std::vector& available_devices) { + std::vector devices; + if (sharding.IsReplicated()) { + for (int64 d : available_devices) { + if (!HloSharding::IsReservedDevice(d)) { + devices.push_back(d); + } + } + return devices; + } + + for (int64 i : available_devices) { + if (sharding.UsesDevice(i)) { + devices.push_back(i); + } + } + DCHECK(std::all_of(sharding.tile_assignment().begin(), + sharding.tile_assignment().end(), [&](int64 device) { + return std::find(available_devices.begin(), + available_devices.end(), + device) != available_devices.end(); + })); + return devices; +} + +} // namespace hlo_sharding_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util.h b/tensorflow/compiler/xla/service/hlo_sharding_util.h new file mode 100644 index 00000000000..00d9434a34d --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util.h @@ -0,0 +1,143 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ + +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" + +namespace xla { +namespace hlo_sharding_util { + +// Given a map, selects the device with higher +// occurrence count (if any). If top_count in not nullptr, it will receive the +// count of the dominant device returned. +absl::optional SelectDominantDevice( + const std::map& device_map, int64* top_count); + +// Assigns all the instructions of a computation, to a given device. +// This API does not recurse into called computations, and does not assign +// instructions which already have sharding. +Status AssignComputationDevice(HloComputation* computation, int64 device); + +// Given an instruction container, returns the device which is most commonly +// occurring among the instructions. +absl::optional GetMostOccurringDevice( + absl::Span instructions); + +// Given a set of computations, tries to extract the dominant device. A device +// is dominant if the combined occurrence among all the instructions of the +// input computations, is greater/equal than/to dominant_factor (real number +// from 0 to 1). +// This API does not recurse into called computations. +// If no device exists that satisfies the condition, the returned optional will +// hold no value. +StatusOr> GetDominantDevice( + absl::Span computations, double dominant_factor); + +// Returns the HloSharding with the tile dimensions and tile assignment +// transposed based on the specified dimension numbers. In case of a tile +// maximal sharding returns the original sharding. +HloSharding TransposeSharding(const HloSharding& sharding, + const std::vector& dimensions); + +// Returns the HloSharding with the tile shape reshaped based on the source and +// target shapes and the tile assignment adjusted to correspond to the new tile +// shape or absl::nullopt if the resulting reshape would create an invalid +// sharding (non continuous or non uniformly sized tiles). In case of a tile +// maximal sharding returns the original sharding. +absl::optional ReshapeSharding(const Shape& source_shape, + const Shape& target_shape, + const HloSharding& sharding); + +// Returns a sharding tiled on unique dimension dim by reshaping the tile +// assignment of the sharding argument. Only dimensions in the dims span +// argument are considered for reshaping, the others are ignored. +// Assumptions: sharding is tile sharded, and dim must be included in dims. +HloSharding ReshapeToTileDimension(const HloSharding& sharding, int64 dim, + absl::Span dims); + +// Returns true if the provided module includes one or more instructions with +// a tile sharding. +bool ContainsTileSharding(const HloModule& module); + +// Returns the preferred output sharding for a gather op based on the sharding +// of the indces. +HloSharding GatherOutputSharding(const HloSharding& index_sharding, + const HloInstruction* hlo); + +// Returns the preferred index sharding for a gather op based on the sharding +// of the output. 
+HloSharding GatherIndexSharding(const HloSharding& output_sharding, + const HloInstruction* hlo); + +// Returns a new HloSharding for a gather op so that only non offset dimensions +// are sharded. Assume "result" is returned by this function. It is ensured that +// "GetIndexSharding(result, hlo)" will have the same number of elements as +// "result". +HloSharding GatherEffectiveOutputSharding(const HloInstruction& hlo); + +// Returns the preferred index sharding for a scatter op based on the sharding +// of the data. +HloSharding ScatterIndexSharding(const HloSharding& data_sharding, + const HloInstruction* hlo); + +// Returns the preferred data sharding for a scatter op based on the sharding +// of the index. +HloSharding ScatterDataSharding(const HloSharding& index_sharding, + const HloInstruction* hlo); + +// Returns a new index sharding for a scatter op so that we only shard on first +// "number of scatter_window_dims" dimensions. Assume "result" is returned by +// this function. It is ensured that "ScatterDataSharding(result, hlo)" will +// have the same number of elements as "result". +HloSharding ScatterEffectiveIndexSharding(const HloSharding& index_sharding, + const HloInstruction& hlo); + +// Returns a new data sharding for a scatter op so that we only shard on +// scatter_window_dims. Assume "result" is returned by this function. It is +// ensured that "ScatterIndexSharding(result, hlo)" will have the same number of +// elements as "result". +HloSharding ScatterEffectiveDataSharding(const HloSharding& data_sharding, + const HloInstruction& hlo); + +// Returns an identity value and an HloOpcode for reduce computation of scatter +// instruction. +// - If computation is add/or, return 0/false with corresponding op code; +// - If computation is multiply/and, return 1/true with corresponding op code. +// - If computation is min/max, return max value/min value with corresponding op +// code. +// - Otherwise, return error status. +StatusOr, HloOpcode>> +IdentityValueAndHloOpcodeForScatterReduceComputation( + const HloScatterInstruction& scatter); + +// Given a sharding and a list of devices in the topology, return a +// list of the devices that `sharding` applies to. +std::vector DevicesForSharding( + const HloSharding& sharding, const std::vector& available_devices); + +} // namespace hlo_sharding_util +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc new file mode 100644 index 00000000000..02496c75965 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_util_test.cc @@ -0,0 +1,206 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" + +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace hlo_sharding_util { +namespace { + +TEST(HloShardingUtilTest, TransposeShardingReplicated) { + EXPECT_EQ(TransposeSharding(HloSharding::Replicate(), {0, 1, 2}), + HloSharding::Replicate()); +} + +TEST(HloShardingUtilTest, TransposeShardingTiled) { + HloSharding input = HloSharding::Tile(Array4D({{{{0, 1}}, {{2, 3}}}})); + HloSharding output = + HloSharding::Tile(Array4D({{{{0}, {2}}}, {{{1}, {3}}}})); + EXPECT_EQ(TransposeSharding(input, {3, 0, 1, 2}), output); +} + +TEST(HloShardingUtilTest, ReshapeShardingMaximal) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::AssignDevice(7); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledInvalid) { + Shape input_shape = ShapeUtil::MakeShape(F32, {2, 3, 5}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 2}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_FALSE(result.has_value()); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {4, 5, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {20, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplit) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 4, 7}); + HloSharding input_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledSplitThenMerge) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 4, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 16, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTiledArbitraryMinorDimensions) { + Shape input_shape = ShapeUtil::MakeShape(F32, {16, 7, 5, 3}); + Shape output_shape = ShapeUtil::MakeShape(F32, {4, 15, 2, 14}); + Array sharding_array({2, 1, 1, 1}); + sharding_array(0, 0, 0, 0) = 0; + sharding_array(1, 0, 0, 0) = 1; + HloSharding sharding = HloSharding::Tile(sharding_array); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, 
ReshapeShardingTiledTrivialDimensions) { + Shape input_shape = ShapeUtil::MakeShape(F32, {3, 1, 5, 7}); + Shape output_shape = ShapeUtil::MakeShape(F32, {3, 5, 1, 7}); + HloSharding input_sharding = + HloSharding::Tile(Array4D({{{{0}, {1}}}})); + HloSharding output_sharding = + HloSharding::Tile(Array4D({{{{0}}, {{1}}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingTrivialDImensionInsertedToEnd) { + Shape input_shape = ShapeUtil::MakeShape(F32, {8, 16}); + Shape output_shape = ShapeUtil::MakeShape(F32, {8, 16, 1}); + HloSharding input_sharding = HloSharding::Tile(Array2D({{0}, {1}})); + HloSharding output_sharding = + HloSharding::Tile(Array3D({{{0}}, {{1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, input_sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), output_sharding); +} + +TEST(HloShardingUtilTest, NoopReshapeShardingEmptyTile) { + Shape shape = ShapeUtil::MakeShape(F32, {7, 1, 1}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = ReshapeSharding(shape, shape, sharding); + EXPECT_TRUE(result.has_value()); + EXPECT_EQ(result.value(), sharding); +} + +TEST(HloShardingUtilTest, ReshapeShardingScalar) { + Shape input_shape = ShapeUtil::MakeShape(F32, {1, 1, 1}); + Shape output_shape = ShapeUtil::MakeShape(F32, {}); + HloSharding sharding = HloSharding::Tile(Array3D({{{0}, {1}}})); + absl::optional result = + ReshapeSharding(input_shape, output_shape, sharding); + EXPECT_FALSE(result.has_value()); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim0) { + HloSharding sharding = HloSharding::Tile(Array2D({{0, 1}, {2, 3}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1}); + EXPECT_EQ(result.tile_assignment(), Array2D({{0}, {1}, {2}, {3}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim1) { + HloSharding sharding = HloSharding::Tile(Array2D({{0, 1}, {2, 3}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1}); + EXPECT_EQ(result.tile_assignment(), Array2D({{0, 2, 1, 3}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim0) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/0, /*dims=*/{0, 1, 2}); + EXPECT_EQ( + result.tile_assignment(), + Array3D({{{0}}, {{1}}, {{2}}, {{3}}, {{4}}, {{5}}, {{6}}, {{7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim1) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/1, /*dims=*/{0, 1, 2}); + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0}, {1}, {4}, {5}, {2}, {3}, {6}, {7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension3D_Dim2) { + HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + HloSharding result = + ReshapeToTileDimension(sharding, /*dim=*/2, /*dims=*/{0, 1, 2}); + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0, 2, 4, 6, 1, 3, 5, 7}}})); +} + +TEST(HloShardingUtilTest, ReshapeToTileDimension2D_Dim2_Batch1) { + // Tile sharding in batch dimension, i.e. + // sharding={devices[2,2,2]0,1,2,3,4,5,6,7,8}. 
+ HloSharding sharding = + HloSharding::Tile(Array3D({{{0, 1}, {2, 3}}, {{4, 5}, {6, 7}}})); + // Reshape on dimensions {1, 2} only, therefore ignoring batch dimension 0. + HloSharding result = ReshapeToTileDimension(sharding, /*dim=*/2, + /*dims=*/{1, 2}); + // Expected result is {devices=[2,1,4]0,2,1,3,4,6,5,7}, i.e. the two + // non-batch dimensions {{0, 1}, {2, 3}} and {{4, 5}, {6, 7}} are individually + // reshaped to tile dimension 2, i.e. {{0, 2, 1, 3}}, {{4, 6, 5, 7}}. + EXPECT_EQ(result.tile_assignment(), + Array3D({{{0, 2, 1, 3}}, {{4, 6, 5, 7}}})); +} + +} // namespace +} // namespace hlo_sharding_util +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/BUILD b/tensorflow/compiler/xla/service/spmd/BUILD new file mode 100644 index 00000000000..5be6a04f934 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/BUILD @@ -0,0 +1,69 @@ +# Description: SPMD partitioning pass. + +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + default_visibility = [":friends"], + licenses = ["notice"], # Apache 2.0 +) + +package_group( + name = "friends", + includes = [ + "//tensorflow/compiler/xla:friends", + ], +) + +cc_library( + name = "spmd_partitioner", + srcs = [ + "spmd_partitioner.cc", + "spmd_partitioner_util.cc", + ], + hdrs = [ + "spmd_partitioner.h", + "spmd_partitioner_util.h", + ], + deps = [ + "//tensorflow/compiler/xla:comparison_util", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:protobuf_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/client/lib:comparators", + "//tensorflow/compiler/xla/service:flatten_call_graph", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_cse", + "//tensorflow/compiler/xla/service:hlo_dce", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_query", + "//tensorflow/compiler/xla/service:hlo_sharding_util", + "//tensorflow/compiler/xla/service:shape_inference", + "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/core/platform:numbers", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +tf_cc_test( + name = "spmd_partitioner_test", + srcs = ["spmd_partitioner_test.cc"], + deps = [ + ":spmd_partitioner", + "//tensorflow/compiler/xla:xla_data_proto_cc", + "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_pass_pipeline", + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc new file mode 100644 index 00000000000..fd865342ca3 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -0,0 +1,4655 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include + +#include +#include +#include +#include + +#include "absl/algorithm/container.h" +#include "absl/memory/memory.h" +#include "absl/strings/str_cat.h" +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/client/lib/comparators.h" +#include "tensorflow/compiler/xla/comparison_util.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/protobuf_util.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/flatten_call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_cse.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_query.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_util.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/numbers.h" + +namespace xla { +namespace spmd { + +string SpmdLogger::MakeReport() { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory during transformation *****\n"); + + std::sort(entries_.begin(), entries_.end(), + [](auto const& entry0, auto const& entry1) { + return entry0.first > entry1.first; + }); + for (int64 i = 0; + i < std::min(report_instruction_count_, entries_.size()); ++i) { + absl::StrAppend( + &report, "\n ", + tensorflow::strings::HumanReadableNumBytes(entries_[i].first), " : ", + entries_[i].second, "\n"); + } + + return report; +} + +void SpmdLogger::RegisterLogEntry(HloInstruction* hlo, + const std::vector& group) { + string report = hlo->ToString(); + int64 max_value = -1; + for (HloInstruction* inst : group) { + if (inst->shape().IsTuple()) { + continue; + } + max_value = + std::max(max_value, ShapeUtil::ByteSizeOf(inst->shape(), 4)); + absl::StrAppend(&report, " * ", inst->ToString(), "\n"); + } + entries_.push_back(std::make_pair(max_value, report)); +} + +/* static */ string SpmdLogger::ReportBeforePartition( + const HloModule& module, int64 report_instruction_count) { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory usage before partition *****\n"); + absl::StrAppend(&report, "\n ** Replicated instructions\n"); + absl::StrAppend(&report, ReportMemoryUsage( + module, + 
[](const HloInstruction* hlo) { + return !hlo->has_sharding() || + hlo->sharding().IsReplicated(); + }, + report_instruction_count)); + absl::StrAppend(&report, "\n ** All instructions\n"); + absl::StrAppend(&report, + ReportMemoryUsage( + module, [](const HloInstruction* hlo) { return true; }, + report_instruction_count)); + return report; +} + +/* static */ string SpmdLogger::ReportAfterPartition( + const HloModule& module, int64 report_instruction_count) { + string report; + absl::StrAppend(&report, + "\n\n***** SPMD memory usage after partition *****\n"); + absl::StrAppend(&report, + ReportMemoryUsage( + module, [](const HloInstruction* hlo) { return true; }, + report_instruction_count)); + return report; +} + +template +/* static */ string SpmdLogger::ReportMemoryUsage( + const HloModule& module, const F& filter, int64 report_instruction_count) { + string report; + std::vector instructions; + instructions.reserve(module.instruction_count()); + + for (auto computation : module.computations()) { + if (computation->IsFusionComputation()) { + continue; + } + for (auto hlo : computation->instructions()) { + if (hlo->shape().IsTuple() || + ShapeUtil::IsEffectiveScalar(hlo->shape())) { + continue; + } + if (filter(hlo)) { + instructions.push_back(hlo); + } + } + } + + const auto add_report = [&](std::vector* insts) { + std::sort(insts->begin(), insts->end(), + [](const HloInstruction* inst0, const HloInstruction* inst1) { + return ShapeUtil::ByteSizeOf(inst0->shape()) > + ShapeUtil::ByteSizeOf(inst1->shape()); + }); + for (int64 i = 0; + i < std::min(report_instruction_count, insts->size()); ++i) { + absl::StrAppend(&report, " ", + tensorflow::strings::HumanReadableNumBytes( + ShapeUtil::ByteSizeOf((*insts)[i]->shape())), + " : ", (*insts)[i]->ToString(), "\n"); + } + }; + + add_report(&instructions); + return report; +} + +namespace { + +// Returns the replica group configuration where each replica belongs to its own +// group. +std::vector CreateReplicaGroups(int64 num_replicas) { + std::vector groups(num_replicas); + for (int64 i = 0; i < num_replicas; ++i) { + groups[i].add_replica_ids(i); + } + return groups; +} + +bool CanReshardWithAllToAll(const HloSharding& source, + const HloSharding& target) { + return UniqueTiledDim(source) && UniqueTiledDim(target) && + UniqueTiledDim(source) != UniqueTiledDim(target); +} + +bool CanReshardWithCollectivePermute(const HloSharding& source, + const HloSharding& target) { + return UniqueTiledDim(source) && UniqueTiledDim(target) && + UniqueTiledDim(source) == UniqueTiledDim(target) && source != target; +} + +// Clears all sharding attributes from instructions in the module. This must be +// called only after all SPMD transformation is complete. +Status ClearShardingAttributes(HloModule* module) { + for (HloComputation* computation : module->computations()) { + for (HloInstruction* hlo : computation->instructions()) { + // Keep sharding annotation on Infeed and entry parameters since they're + // used by HloReplicationAnalysis later (for ArCrsCombiner). 
+ if (hlo->opcode() == HloOpcode::kInfeed) { + continue; + } + if (hlo->opcode() == HloOpcode::kParameter && + computation == module->entry_computation()) { + continue; + } + hlo->clear_sharding(); + } + } + return Status::OK(); +} + +} // namespace + +HloInstruction* SpmdBuilder::AddInstruction( + std::unique_ptr instruction) { + HloInstruction* hlo = + HloComputation::Builder::AddInstruction(std::move(instruction)); + if (visiting_hlo_) { + instructions_[visiting_hlo_].push_back(hlo); + } + return hlo; +} + +PartitionedHlo PartitionedHlo::Reshard(const HloSharding& target) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + for (auto& entry : cache) { + if (entry.first == target) { + return entry.second; + } + } + cache.emplace_back(target, ReshardNoCache(target)); + state_.reshard_cache->per_hlo_cache[cache.back().second.hlo()] + .reshard_cache.emplace_back(sharding(), *this); + return cache.back().second; +} + +PartitionedHlo PartitionedHlo::ReshardNoCache(const HloSharding& target) { + VLOG(2) << "Resharding " << hlo_->ToString() << " from " + << hlo_->sharding().ToString() << " to " << target.ToString(); + const Shape& shape = hlo_->shape(); + CHECK(shape.IsTuple() || !target.IsTuple()); + + // Tuple shape instructions may have non-tuple sharding, which means that the + // same sharding applies to all the leaves. + if (shape.IsTuple() && !target.IsTuple()) { + return Reshard(target.GetTupleSharding(shape).ValueOrDie()); + } + + // For a tuple shape, recursively apply Reshard to all the leaves and return + // a tuple instruction. + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + auto subshape = ShapeUtil::GetTupleElementShape(shape, i); + auto element = state_.b->AddInstruction( + HloInstruction::CreateGetTupleElement(subshape, hlo(), i)); + element->set_sharding(sharding().GetSubSharding(shape, {i})); + elements.push_back( + PartitionedHlo( + element, ShapeUtil::GetTupleElementShape(base_shape_, i), state_) + .Reshard(target.GetSubSharding(shape, {i})) + .hlo()); + } + auto tuple = + state_.b->AddInstruction(HloInstruction::CreateTuple(elements)); + tuple->set_sharding(target); + return PartitionedHlo(tuple, base_shape_, state_); + } + + if (sharding() == target) { + return *this; + } + + if (shape.element_type() == TOKEN) { + return *this; + } + + if (CanReshardWithCollectivePermute(sharding(), target)) { + return ReshardWithCollectivePermute(target); + } + + if (CanReshardWithAllToAll(sharding(), target)) { + return ReshardWithAllToAll(target); + } + + // If not replicated yet, first replicate and then reshard to use one of the + // two implementations below. + if (!sharding().IsReplicated()) { + return Replicate().Reshard(target); + } + + // 'Replicated' to 'SingleDevice'. + if (target.IsTileMaximal()) { + auto copy = state_.b->AddInstruction( + HloInstruction::CreateUnary(hlo_->shape(), HloOpcode::kCopy, hlo_)); + copy->set_sharding(target); + return PartitionedHlo(copy, base_shape_, state_); + } + + // 'Replicated' to 'Tiled'. 
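+ // A replicated value is resharded to a tiled sharding by padding it so every
+ // shard is full-sized and then having each partition dynamic-slice out its
+ // own shard. For example (illustrative numbers): a replicated f32[10] going
+ // to a 4-way tiling on dim 0 is padded to f32[12], and partition p slices
+ // the f32[3] shard starting at offset 3 * p.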
+ auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + auto shard_shape = MakePartitionedShape(shape, target); + auto slice = state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, + MakePartitionOffsets(shape, target, state_.partition_id, state_.b), + shard_shape.dimensions())); + slice->set_sharding(target); + return PartitionedHlo(slice, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::PadWithValue(HloInstruction* pad_value) const { + const HloSharding& sharding = hlo_->sharding(); + const Shape& shape = hlo_->shape(); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + if (sharding.IsReplicated() || EvenlyPartitions(base_shape_, sharding)) { + return *this; + } + CHECK(!sharding.IsTileMaximal()); + auto index_shape = ShapeUtil::ChangeElementType(shape, S32); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + auto get_mask_for_dim = [&](int64 dim, HloInstruction* start_index) { + // Comparison: iota + start_index < valid_size + auto iota = + state_.b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, start_index, {})); + auto index_in_full_shape = + state_.b->AddInstruction(HloInstruction::CreateBinary( + index_shape, HloOpcode::kAdd, iota, broadcast_start_index)); + auto valid_size = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(base_shape_.dimensions(dim)))); + auto broadcast_valid_size = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(index_shape, valid_size, {})); + return state_.b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_full_shape, broadcast_valid_size, + ComparisonDirection::kLt)); + }; + + HloInstruction* mask = nullptr; + auto offsets = MakePartitionOffsets(base_shape_, sharding, + state_.partition_id, state_.b); + for (int64 i = 0; i < shape.rank(); ++i) { + if (base_shape_.dimensions(i) % sharding.tile_assignment().dim(i) == 0) { + continue; + } + if (mask == nullptr) { + mask = get_mask_for_dim(i, offsets[i]); + } else { + mask = state_.b->AddInstruction( + HloInstruction::CreateBinary(mask->shape(), HloOpcode::kAnd, mask, + get_mask_for_dim(i, offsets[i]))); + } + } + + if (mask == nullptr) { + return *this; + } + + auto broadcast_pad_value = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, pad_value, {})); + auto result = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, mask, hlo_, broadcast_pad_value)); + result->set_sharding(sharding); + return PartitionedHlo(result, base_shape_, state_); +} + +absl::optional +PartitionedHlo::ReshardAsWindowedInput(const Window& window, + const HloSharding& target, + HloInstruction* pad_value, + bool mask_invalid_region) { + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].window_reshard_cache; + for (auto& entry : cache) { + if (std::get<0>(entry) == target && + protobuf_util::ProtobufEquals(std::get<1>(entry), window)) { + return std::get<2>(entry); + } + } + auto update_cache = [&](WindowedInputShardReturnValue result) { + cache.emplace_back(target, window, std::move(result)); + return std::get<2>(cache.back()); + }; + VLOG(2) << "ReshardAsWindowedInput()\n" + << "\twindow:" << window_util::ToString(window) + << "\ttarget sharding:" << target.ToString(); + + CHECK(!target.IsTileMaximal()); + auto partition_ordinals = + MakeTiledPartitionOrdinals(target, state_.partition_id, 
state_.b); + auto shard_shape = base_shape_; + + std::vector start_on_padded_calculations( + base_shape_.rank()); + std::vector limit_on_padded_calculations( + base_shape_.rank()); + std::vector dynamic_slice_offset_on_output( + base_shape_.rank(), nullptr); + + Window shard_window = window; + auto padded_shape = base_shape_; + std::vector offsets_on_padded_shape(base_shape_.rank()); + std::vector per_shard_window_counts(base_shape_.rank()); + std::vector explicit_left_padding(base_shape_.rank()); + for (int64 i = 0; i < base_shape_.rank(); ++i) { + // Do not pad non-partitioned dimensions. + int64 shard_count = target.tile_assignment().dim(i); + if (shard_count == 1) { + offsets_on_padded_shape[i] = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + continue; + } + const auto& wd = window.dimensions(i); + if (wd.window_dilation() != 1) { + // TODO(yuanzx): Support window dilation. + VLOG(2) << "Failed to reshard window operand due to window dilation"; + return absl::nullopt; + } + int64 full_size = + base_shape_.dimensions(i) + + (wd.base_dilation() - 1) * (base_shape_.dimensions(i) - 1) + + wd.padding_high() + wd.padding_low(); + if (full_size < wd.size()) { + VLOG(2) << "Failed to reshard window operand because the window size is " + "larger than padded base size"; + return absl::nullopt; + } + int64 window_count = (full_size - wd.size()) / wd.stride() + 1; + per_shard_window_counts[i] = CeilOfRatio(window_count, shard_count); + if (wd.stride() != 1 && + (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation() != 0) { + // TODO(yuanzx): Support this case. + VLOG(2) << "Failed to reshard window operand due to non-trivial dilation"; + return absl::nullopt; + } + + // We use explicit padding for full dilations, then use padding_low and + // padding_high on the sharded op for the remaining. padding_low and + // padding_high are now given initial values, which will be later updated if + // dilation is not 1. + auto swd = shard_window.mutable_dimensions(i); + explicit_left_padding[i] = wd.padding_low() / wd.base_dilation(); + swd->set_padding_low(wd.padding_low() % wd.base_dilation()); + swd->set_padding_high(0); + + // Calculation for the first element needed on the 'padded-but-not-dilated' + // shape. The start on the dilated shape could be a hole, so we add + // wd.base_dilation() - 1 to the constant term to skip the leading holes. + start_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + wd.base_dilation() - 1 - swd->padding_low(), wd.base_dilation()); + int64 dilated_shard_size = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + limit_on_padded_calculations[i] = MultiplyAddDivideOffsetCalculation( + wd.stride() * per_shard_window_counts[i], + dilated_shard_size + wd.base_dilation() - 1 - swd->padding_low(), + wd.base_dilation()); + + offsets_on_padded_shape[i] = start_on_padded_calculations[i].Calculate( + partition_ordinals[i], state_.b); + + auto shard_size_function = + limit_on_padded_calculations[i] - start_on_padded_calculations[i]; + int64 max_shard_size = shard_size_function.MaxInRange(0, shard_count); + shard_shape.set_dimensions(i, max_shard_size); + padded_shape.set_dimensions( + i, limit_on_padded_calculations[i].Calculate(shard_count - 1)); + + // For base dilation, calculate the needed padding_low and padding_high, as + // well as the offset for the output if a dynamic slice is needed after the + // sharded op. 
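+ // Base dilation places wd.base_dilation() - 1 holes between input elements
+ // on the dilated shape, so a shard's first window may land on a hole. The
+ // block below compensates by recomputing padding_low/padding_high per shard
+ // and, if shards would otherwise start at different offsets, by recording an
+ // offset for a dynamic-slice applied to the output of the sharded op.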
+ if (wd.base_dilation() != 1) { + // Returns the offset of a shard's first valid element in the dilated + // shard. + auto get_first_valid_element_offset_on_dilated_shard = + [&](int64 shard_ordinal) { + return start_on_padded_calculations[i].Calculate(shard_ordinal) * + wd.base_dilation() + + swd->padding_low() - + wd.stride() * per_shard_window_counts[i] * shard_ordinal; + }; + CHECK_EQ(get_first_valid_element_offset_on_dilated_shard(0), + swd->padding_low()); + + // Determine swd->padding_high. + for (int64 shard_ordinal = 0; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 wanted_limit_on_dilated_shard = + wd.stride() * (per_shard_window_counts[i] - 1) + wd.size(); + int64 actual_limit_on_dilated_shard_without_pad_high = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal) + + (max_shard_size - 1) * wd.base_dilation() + 1; + swd->set_padding_high(std::max( + swd->padding_high(), + wanted_limit_on_dilated_shard - + actual_limit_on_dilated_shard_without_pad_high)); + } + + // Determine swd->padding_low and output dynamic slice index. + if (wd.stride() == 1) { + int64 max_pad_low = get_first_valid_element_offset_on_dilated_shard(0); + bool all_same = true; + for (int64 shard_ordinal = 1; shard_ordinal < shard_count; + ++shard_ordinal) { + int64 start = + get_first_valid_element_offset_on_dilated_shard(shard_ordinal); + if (start != swd->padding_low()) { + all_same = false; + } + max_pad_low = std::max(max_pad_low, start); + } + if (!all_same) { + auto start_on_padded_input = + start_on_padded_calculations[i].Calculate(partition_ordinals[i], + state_.b); + // We will calculate + // max_pad_low - (first_window - required_first_window) + // which equals + // required_first_window - (first_window - max_pad_low) + auto first_window_minus_max_pad_low = + MultiplyAddDivideOffsetCalculation( + wd.base_dilation(), swd->padding_low() - max_pad_low, 1) + .Calculate(start_on_padded_input, state_.b); + auto required_first_window = + MultiplyAddDivideOffsetCalculation(per_shard_window_counts[i], 0, + 1) + .Calculate(partition_ordinals[i], state_.b); + dynamic_slice_offset_on_output[i] = + state_.b->AddInstruction(HloInstruction::CreateBinary( + required_first_window->shape(), HloOpcode::kSubtract, + required_first_window, first_window_minus_max_pad_low)); + } + swd->set_padding_low(max_pad_low); + } else { + CHECK_EQ( + (wd.stride() * per_shard_window_counts[i]) % wd.base_dilation(), 0) + << "General base dilation not yet implemented."; + // padding_low on all shards should equal the initially assigned + // swd->padding_low(), i.e., the padding_low() on the original window. + } + } + } + + // Returns the output dynamic slice offset when needed, and absl::nullopt + // otherwise. + auto get_dynamic_slice_offset_on_output_if_needed = + [&]() -> absl::optional> { + if (absl::c_all_of( + dynamic_slice_offset_on_output, + [](HloInstruction* offset) { return offset == nullptr; })) { + return absl::nullopt; + } + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + for (int64 i = 0; i < dynamic_slice_offset_on_output.size(); ++i) { + if (dynamic_slice_offset_on_output[i] == nullptr) { + dynamic_slice_offset_on_output[i] = zero; + } + } + return dynamic_slice_offset_on_output; + }; + + // If the currrent HLO is replicated, pad then slice. 
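+ // Pad the replicated input to padded_shape (the explicit low padding plus
+ // whatever high padding is needed), then let each partition dynamic-slice
+ // its maximal window shard at offsets_on_padded_shape. Non-partitioned
+ // dimensions are left unpadded.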
+ if (sharding().IsReplicated()) { + PaddingConfig padding_config; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + // Do not pad non-partitioned dimensions. + if (target.tile_assignment().dim(i) == 1) { + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + continue; + } + padding_config_dim->set_edge_padding_low(explicit_left_padding[i]); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + explicit_left_padding[i] - + base_shape_.dimensions(i)); + } + auto padded_hlo = ShapeUtil::Compatible(padded_shape, base_shape_) + ? hlo_ + : state_.b->AddInstruction(HloInstruction::CreatePad( + padded_shape, hlo_, pad_value, padding_config)); + auto sharded_input = + state_.b->AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, padded_hlo, offsets_on_padded_shape, + shard_shape.dimensions())); + return update_cache(WindowedInputShardReturnValue{ + sharded_input, shard_window, + get_dynamic_slice_offset_on_output_if_needed()}); + } + + if (target != sharding()) { + return Replicate().ReshardAsWindowedInput(window, target, pad_value); + } + + // Halo exchange. + HloInstruction* visiting_hlo = hlo_; + auto original_shard_shape = MakePartitionedShape(base_shape_, target); + + std::vector left_halo_size_functions(base_shape_.rank()); + std::vector right_halo_size_functions(base_shape_.rank()); + // TODO(yuanzx): We are concatenating on each sharded dimension one at time, + // and in the second dimension (and beyond) we create halos by slicing the + // concat in the previous dimension, which is not optimal. We should generate + // halos only concating slices, instead of slicing concats. + for (int dim = 0; dim < base_shape_.rank(); ++dim) { + int64 shard_count = target.tile_assignment().dim(dim); + if (shard_count == 1) { + continue; + } + int64 input_shard_size = + CeilOfRatio(base_shape_.dimensions(dim), shard_count); + + // Left halo. The size of the halo is derived by subtracting the first read + // element offset of the i'th partition from the limit of the (i-1)'th + // partition. + MultiplyAddDivideOffsetCalculation shard_limit_of_previous_on_padded( + input_shard_size, explicit_left_padding[dim], 1); + left_halo_size_functions[dim] = + shard_limit_of_previous_on_padded - start_on_padded_calculations[dim]; + + // Right halo. 
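+ // The right halo of shard i measures how far its windows read past the start
+ // of shard (i+1)'s data region on the padded shape:
+ //   right_halo(i) = limit_on_padded(i)
+ //                   - (input_shard_size * (i + 1) + explicit_left_padding[dim]).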
+ MultiplyAddDivideOffsetCalculation shard_start_of_next_on_padded( + input_shard_size, input_shard_size + explicit_left_padding[dim], 1); + right_halo_size_functions[dim] = + limit_on_padded_calculations[dim] - shard_start_of_next_on_padded; + + auto resharded = ExchangeHaloAndGetValidData( + visiting_hlo, base_shape_, left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding[dim], + padded_shape.dimensions(dim), shard_shape.dimensions(dim), dim, target, + offsets_on_padded_shape[dim], pad_value, partition_ordinals[dim], + state_.collective_ops_creator, state_.next_channel_id, state_.b, + mask_invalid_region); + if (!resharded) { + VLOG(1) << "ReshardAsWindowedInput failed without replicate first: halo " + "is beyond the neighbor."; + return Replicate().ReshardAsWindowedInput(window, target, pad_value); + } + visiting_hlo = *resharded; + } + return update_cache(WindowedInputShardReturnValue{ + visiting_hlo, shard_window, + get_dynamic_slice_offset_on_output_if_needed()}); +} + +PartitionedHlo PartitionedHlo::Replicate() { + const HloSharding& sharding = hlo_->sharding(); + const Shape& shape = hlo_->shape(); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + + if (sharding.IsReplicated()) { + return *this; + } + auto& cache = state_.reshard_cache->per_hlo_cache[hlo()].reshard_cache; + for (auto& entry : cache) { + if (entry.first.IsReplicated()) { + return entry.second; + } + } + auto update_cache = [&](PartitionedHlo resharded) { + state_.reshard_cache->per_hlo_cache[resharded.hlo()] + .reshard_cache.emplace_back(sharding, *this); + cache.emplace_back(HloSharding::Replicate(), std::move(resharded)); + return cache.back().second; + }; + // 'Single Device' to 'Repliated'. + if (sharding.IsTileMaximal()) { + return update_cache(Broadcast()); + } + + // 'Tiled' to 'Replicated'. 
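+ // Each partition writes its shard into a zero-initialized, tile-aligned
+ // buffer and the buffers are summed with an all-reduce; since the shard
+ // regions are disjoint, the sum reconstructs the full value. For example
+ // (illustrative numbers): an f32[10] tiled 4 ways has f32[3] shards, so
+ // partition p writes its shard at offset 3 * p into a zero f32[12], the
+ // all-reduce adds the four buffers, and the result is sliced back to f32[10].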
+ Shape padded_base_shape = shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(padded_base_shape, zero, {})); + auto dus = state_.b->AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + padded_base_shape, zero_bcast, hlo_, + MakePartitionOffsets(padded_base_shape, sharding, state_.partition_id, + state_.b))); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto all_reduce = + state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, dus, reduction, NewChannel()); + HloInstruction* result = all_reduce; + if (!ShapeUtil::Compatible(base_shape_, padded_base_shape)) { + std::vector start_indices(shape.rank(), 0); + std::vector strides(shape.rank(), 1); + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + base_shape_, result, start_indices, base_shape_.dimensions(), strides)); + } + result->set_sharding(HloSharding::Replicate()); + return update_cache(PartitionedHlo(result, base_shape_, state_)); +} + +PartitionedHlo PartitionedHlo::Broadcast() const { + const Shape& shape = hlo_->shape(); + const HloSharding& sharding = hlo_->sharding(); + CHECK(sharding.HasUniqueDevice()); + CHECK(!shape.IsTuple() && shape.element_type() != TOKEN); + + auto src_core_id = state_.b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(sharding.GetUniqueDevice()))); + Shape bcast_shape = ShapeUtil::ChangeElementType(shape, PRED); + auto is_src_core = state_.b->AddInstruction(HloInstruction::CreateBroadcast( + bcast_shape, + state_.b->AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), state_.partition_id, src_core_id, + ComparisonDirection::kEq)), + {})); + + auto zero = state_.b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + auto zero_bcast = state_.b->AddInstruction( + HloInstruction::CreateBroadcast(shape, zero, {})); + auto operand = state_.b->AddInstruction(HloInstruction::CreateTernary( + shape, HloOpcode::kSelect, is_src_core, hlo(), zero_bcast)); + HloComputation* reduction = + MakeBinaryAdd(shape.element_type(), state_.module); + + auto result = state_.collective_ops_creator.create_cross_partition_all_reduce( + state_.b, operand, reduction, NewChannel()); + result->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithAllToAll( + const HloSharding& target) const { + int64 partition_count = sharding().tile_assignment().num_elements(); + absl::optional input_partition_dim = UniqueTiledDim(sharding()); + absl::optional output_partition_dim = UniqueTiledDim(target); + CHECK(input_partition_dim.has_value()); + CHECK(output_partition_dim.has_value()); + + // If the device order is different in the target, fix the order with + // ReshardWithCollectivePermute. 
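+ // For example (illustrative): resharding from {devices=[2,1]0,1} to
+ // {devices=[1,2]1,0} first collective-permutes the data to {devices=[2,1]1,0}
+ // so that the device order already matches the target, and only then performs
+ // the all-to-all between the two tiled dimensions.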
+ auto input_tile_fixed_device_order = target.tile_assignment(); + input_tile_fixed_device_order.Reshape( + sharding().tile_assignment().dimensions()); + auto input_sharding_fixed_device_order = + HloSharding::Tile(input_tile_fixed_device_order); + if (input_sharding_fixed_device_order != sharding()) { + auto fixed_order = + ReshardWithCollectivePermute(input_sharding_fixed_device_order); + return fixed_order.ReshardWithAllToAll(target); + } + + auto padded_hlo = + PadBaseShapeBeforeUnevenTiledSharding(hlo_, target, state_.b); + + // The order of ids in the group must follow the target sharding. + std::vector groups(1); + for (int64 device : target.tile_assignment()) { + groups[0].add_replica_ids(device); + } + + HloInstruction* result = nullptr; + + // Split along the split dimension (output_partition_dim) of the all-to-all + // output. + std::vector dimensions; + for (int64 i = 0; i < base_shape_.rank(); ++i) { + if (i == *output_partition_dim) { + dimensions.push_back(partition_count); + dimensions.push_back(padded_hlo->shape().dimensions(i) / partition_count); + } else { + dimensions.push_back(padded_hlo->shape().dimensions(i)); + } + } + auto reshape = state_.b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(base_shape_.element_type(), dimensions), + padded_hlo)); + // After the reshape, it is guaranteed to have at least 3 dimensions. + auto all_to_all = + state_.collective_ops_creator.create_cross_partition_all_to_all( + state_.b, {reshape}, groups, (*state_.next_channel_id)++, + output_partition_dim); + + // Reorder the split dimension of the reshape to be located in front of the + // input partition dimension, so the two dimensions can be combined. + int64 new_input_partition_dim = (*output_partition_dim < *input_partition_dim) + ? *input_partition_dim + 1 + : *input_partition_dim; + std::vector permutation; + for (int64 i = 0; i < all_to_all->shape().rank(); ++i) { + if (i == *output_partition_dim) { + continue; + } + if (i == new_input_partition_dim) { + permutation.push_back(*output_partition_dim); + } + permutation.push_back(i); + } + auto transpose = state_.b->AddInstruction(HloInstruction::CreateTranspose( + ShapeInference::InferTransposeShape(all_to_all->shape(), permutation) + .ValueOrDie(), + all_to_all, permutation)); + + // Combine the split dimension and the input partition dimension. 
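+ // For example (illustrative numbers): resharding f32[8,8] from a 2-way tiling
+ // on dim 0 to a 2-way tiling on dim 1, the local f32[4,8] shard is reshaped
+ // to f32[4,2,4], the all-to-all exchanges data along the introduced size-2
+ // dimension, the transpose above moves that dimension next to the old
+ // partition dimension, and the reshape below merges the two into the f32[8,4]
+ // shard of the target sharding.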
+ auto new_shape = ShapeInference::InferAllToAllShape( + padded_hlo->shape(), *output_partition_dim, + *input_partition_dim, partition_count) + .ValueOrDie(); + result = state_.b->AddInstruction( + HloInstruction::CreateReshape(new_shape, transpose)); + + const Shape result_shape = MakePartitionedShape(base_shape_, target); + if (result_shape != result->shape()) { + result = state_.b->AddInstruction(HloInstruction::CreateSlice( + result_shape, result, std::vector(result_shape.rank(), 0), + result_shape.dimensions(), std::vector(result_shape.rank(), 1))); + } + result->set_sharding(target); + return PartitionedHlo(result, base_shape_, state_); +} + +PartitionedHlo PartitionedHlo::ReshardWithCollectivePermute( + const HloSharding& target) const { + CHECK(CanReshardWithCollectivePermute(sharding(), target)); + std::vector> src_dst_pairs; + sharding().tile_assignment().Each( + [&](absl::Span indices, int64 src_device) { + int64 dst_device = target.tile_assignment()(indices); + if (dst_device != src_device) { + src_dst_pairs.emplace_back(src_device, dst_device); + } + }); + auto cp = + state_.collective_ops_creator.create_cross_partition_collective_permute( + state_.b, hlo(), src_dst_pairs, (*state_.next_channel_id)++); + cp->set_sharding(target); + return PartitionedHlo(cp, base_shape_, state_); +} + +SpmdPartitioningVisitor::SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, SpmdPartitionerOptions options, + SpmdPartitioner* partitioner) + : changed_(false), + module_(computation->parent()), + num_partitions_(num_partitions), + num_replicas_(num_replicas), + collective_ops_creator_(collective_ops_creator), + next_channel_id_(next_channel_id), + b_(SpmdBuilder(computation->name() + "_spmd", /*hlo=*/nullptr)), + partition_id_(collective_ops_creator_.create_partition_id(&b_)), + logger_(logger), + options_(std::move(options)), + partitioner_(partitioner) {} + +Status SpmdPartitioningVisitor::DefaultAction(HloInstruction* hlo) { + if (hlo->HasSideEffect()) { + return Unimplemented("Side-effect ops cannot be replicated: %s", + hlo->ToString()); + } + + if (hlo->IsElementwise() && hlo->operand_count() > 0) { + return HandleElementwise(hlo); + } + + if (!hlo->sharding().IsTileMaximal()) { + VLOG(1) << "Not partitioned in SPMD mode (DefaultAction):" + << hlo->ToString(); + for (int64 i = 0; i < hlo->operand_count(); ++i) { + VLOG(1) << " operand " << i + << " sharding:" << hlo->operand(i)->sharding().ToString(); + } + } + + // If the instruction cannot be partitioned, replicate the instruction unless + // the instruction has side-effect. 
+ std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(HloSharding::Replicate()).hlo()); + } + auto clone = + b_.AddInstruction(hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::Replicate()); + clone->set_metadata(hlo->metadata()); + SetPartitionedHlo(hlo, + PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding())); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Preprocess(HloInstruction* hlo) { + visiting_hlo_ = hlo; + b_.set_visiting_hlo(hlo); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::Postprocess(HloInstruction* hlo) { + logger_->RegisterLogEntry(GetPartitionedHlo(hlo).hlo(), + b_.derived_instructions(hlo)); + visiting_hlo_ = nullptr; + b_.set_visiting_hlo(nullptr); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleElementwise(HloInstruction* hlo) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(hlo->sharding()).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConcatenate(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + const Shape shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + const int64 dimension = hlo->concatenate_dimension(); + if (sharding.tile_assignment().dim(dimension) == 1) { + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back( + GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(shard_shape, new_operands)); + }); + return Status::OK(); + } + + // If the concatenate dimension is along one of the partitioned dimensions, + // allocate the full output shape, each partition updates its owned region, + // all-reduce across partitions, and then slice its output region. + + // We currently don't support subgroup all-reduce along partitions, so more + // than 1 partitioned dimensions is not supported. + if (sharding.tile_assignment().dim(dimension) != num_partitions_) { + return DefaultAction(hlo); + } + + // temp_output_shape is the output shape where the concatenate dimension + // is changed to the full (and padded to shard count) dimension size. + auto temp_output_shape = MakePartitionedShape(hlo->shape(), sharding); + temp_output_shape.set_dimensions( + dimension, temp_output_shape.dimensions(dimension) * + sharding.tile_assignment().dim(dimension)); + auto temp_output = CreateZero(temp_output_shape, &b_); + + // Offset of each operand along the concatenate dimension. 
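+ // For example (illustrative numbers): concatenating two f32[6] operands along
+ // a dimension that is 2-way sharded, temp_output is f32[12]; partition p
+ // writes its f32[3] shard of the first operand at offset 3 * p and of the
+ // second operand at offset 6 + 3 * p, the all-reduce merges the disjoint
+ // updates, and each partition finally slices its own f32[6] output shard at
+ // offset 6 * p.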
+ int64 offset = 0; + for (HloInstruction* operand : hlo->operands()) { + auto spmd_operand = GetPartitionedHlo(operand).Reshard(sharding).hlo(); + std::vector start_indices( + hlo->shape().rank(), b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(S32)))); + start_indices[dimension] = + MultiplyAddDivideOffsetCalculation( + spmd_operand->shape().dimensions(dimension), offset, 1) + .Calculate(MakeTiledPartitionOrdinals(sharding, partition_id_, + &b_)[dimension], + &b_); + temp_output = b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + temp_output_shape, temp_output, spmd_operand, start_indices)); + offset += operand->shape().dimensions(dimension); + } + auto all_reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, temp_output, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + SetPartitionedHlo(hlo, [&] { + auto start_indices = + MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_); + start_indices[dimension] = MultiplyAddDivideOffsetCalculation( + shard_shape.dimensions(dimension), 0, 1) + .Calculate(start_indices[dimension], &b_); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, all_reduce, start_indices, shard_shape.dimensions())); + }); + + return Status::OK(); +} + +// If partitioning in the operand only happens in dimensions in passthrough +// dimensions (offset dimensions in the gather output (or scatter update) that +// have the same size as the operand), returns the corresponding output (or +// update) sharding by passing through the input sharding. +absl::optional PassthroughOperandToGatherOutputOrScatterUpdate( + const PartitionedHlo& operand, const Shape& update_or_gather_shape, + absl::Span collapsed_or_inserted_dims, + absl::Span index_map, + absl::Span offset_or_window_dims, + absl::Span slice_size) { + if (operand.sharding().IsTileMaximal()) { + return operand.sharding(); + } + std::vector passthrough_tile(update_or_gather_shape.rank(), 1); + int64 collapsed = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + int64 dim_partitions = operand.sharding().tile_assignment().dim(i); + if (absl::c_linear_search(collapsed_or_inserted_dims, i) || + absl::c_linear_search(index_map, i)) { + if (dim_partitions > 1) { + return absl::nullopt; + } + collapsed++; + continue; + } + if (slice_size[i] != operand.base_shape().dimensions(i) && + dim_partitions > 1) { + return absl::nullopt; + } + int64 offset_dim = offset_or_window_dims[i - collapsed]; + if (i - collapsed > 0 && + offset_dim < offset_or_window_dims[i - collapsed - 1]) { + // Output offsets are transposed, we do not support this case. + return absl::nullopt; + } + passthrough_tile[offset_dim] = dim_partitions; + } + Array tile_assignment = operand.sharding().tile_assignment(); + tile_assignment.Reshape(passthrough_tile); + return HloSharding::Tile(tile_assignment); +} + +// Returns whether partitioning in the operand only happens in dimensions with +// gather/scatter slice size 1. 
+bool GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + const PartitionedHlo& operand, absl::Span index_map, + absl::Span slice_size, int64 num_partitions) { + if (operand.sharding().IsTileMaximal()) { + return false; + } + int64 trivial_slice_dims_partitions = 1; + for (int64 dim : index_map) { + if (slice_size[dim] == 1) { + trivial_slice_dims_partitions *= + operand.sharding().tile_assignment().dim(dim); + } + } + return trivial_slice_dims_partitions == num_partitions; +} + +// Returns the min and max for the indices (replicated) in a scatter/gather +// which has the operand partitioned on trivial slice dimensions (slice size 1). +std::pair +IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + const PartitionedHlo& operand, const PartitionedHlo& replicated_indices, + HloInstruction* partition_id, absl::Span index_map, + int64 index_vector_dim, SpmdBuilder* b) { + auto operand_offsets = MakePartitionOffsets( + operand.base_shape(), operand.sharding(), partition_id, b); + // Find the per-dimension index bounds. + std::vector min_indices; + std::vector max_indices; + for (int64 i = 0; i < index_map.size(); ++i) { + int64 dim = index_map[i]; + int64 partitions = operand.sharding().tile_assignment().dim(dim); + if (partitions == 1) { + min_indices.push_back(CreateR0WithType( + replicated_indices.base_shape().element_type(), 0, b)); + max_indices.push_back(CreateR0WithType( + replicated_indices.base_shape().element_type(), + operand.base_shape().dimensions(dim), b)); + continue; + } + auto offset = operand_offsets[dim]; + if (offset->shape().element_type() != + replicated_indices.base_shape().element_type()) { + offset = b->AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(replicated_indices.base_shape().element_type(), + {}), + offset)); + } + min_indices.push_back(offset); + auto partition_size_minus_1 = + CreateR0WithType(replicated_indices.base_shape().element_type(), + operand.hlo()->shape().dimensions(dim) - 1, b); + max_indices.push_back(b->AddInstruction(HloInstruction::CreateBinary( + offset->shape(), HloOpcode::kAdd, offset, partition_size_minus_1))); + } + // Broadcast the index bounds to the same shape as the indices. + HloInstruction* broadcast_min; + HloInstruction* broadcast_max; + if (index_vector_dim < replicated_indices.base_shape().rank()) { + // The index vector is an R1, we need to reshape individual bounds to + // [1], and concat them if there are more than one. 
+ for (int64 i = 0; i < min_indices.size(); ++i) { + min_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(min_indices[i]->shape().element_type(), {1}), + min_indices[i])); + max_indices[i] = b->AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(max_indices[i]->shape().element_type(), {1}), + max_indices[i])); + } + int64 slice_dims = max_indices.size(); + if (slice_dims > 1) { + min_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(min_indices[0]->shape().element_type(), + {slice_dims}), + min_indices, 0)); + max_indices[0] = b->AddInstruction(HloInstruction::CreateConcatenate( + min_indices[0]->shape(), max_indices, 0)); + } + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {index_vector_dim})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {index_vector_dim})); + } else { + CHECK_EQ(max_indices.size(), 1); + broadcast_min = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), min_indices[0], {})); + broadcast_max = b->AddInstruction(HloInstruction::CreateBroadcast( + replicated_indices.base_shape(), max_indices[0], {})); + } + return {broadcast_min, broadcast_max}; +} + +Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { + auto scatter = Cast(hlo); + auto dnums = scatter->scatter_dimension_numbers(); + auto operand = GetPartitionedHlo(scatter->operand(0)); + auto indices = GetPartitionedHlo(scatter->operand(1)); + auto updates = GetPartitionedHlo(scatter->operand(2)); + std::vector slice_size(operand.base_shape().rank(), 1); + int64 num_update_window_dims = 0; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + if (absl::c_linear_search(dnums.inserted_window_dims(), i)) { + continue; + } + slice_size[i] = updates.base_shape().dimensions( + dnums.update_window_dims(num_update_window_dims++)); + } + std::vector inserted_window_dims(dnums.inserted_window_dims().begin(), + dnums.inserted_window_dims().end()); + std::vector scatter_dims_to_operand_dims( + dnums.scatter_dims_to_operand_dims().begin(), + dnums.scatter_dims_to_operand_dims().end()); + std::vector update_window_dims(dnums.update_window_dims().begin(), + dnums.update_window_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, updates.base_shape(), inserted_window_dims, + scatter_dims_to_operand_dims, update_window_dims, slice_size); + // Handle pass through cases if we can use compatible sharding for update. 
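+ // In the pass-through case the operand is partitioned only on update-window
+ // dimensions whose slice size covers the full dimension, so each partition
+ // can scatter locally into its own operand shard: the indices are replicated
+ // and the updates are resharded to the matching pass-through sharding.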
+ if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(*maybe_passthrough); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), indices.hlo(), updates.hlo(), + scatter->to_apply(), dnums, scatter->indices_are_sorted(), + scatter->unique_indices())); + pscatter->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, scatter_dims_to_operand_dims, slice_size, + num_partitions_) && + ShapeUtil::ByteSizeOf(updates.base_shape()) < + ShapeUtil::ByteSizeOf(scatter->shape())) { + // Operand is sharded on trivial slice dims (update slice size 1). We can + // adjust the indices on each partition by subtracting the offsets. Then + // we execute a scatter on full updated indices, and out-of-bound accesses + // will have no effect on the result as guaranteed by the scatter + // semantics. + indices = indices.Reshard(HloSharding::Replicate()); + updates = updates.Reshard(HloSharding::Replicate()); + HloInstruction* indices_min; + HloInstruction* indices_max_unused; + std::tie(indices_min, indices_max_unused) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, scatter_dims_to_operand_dims, + dnums.index_vector_dim(), &b_); + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.hlo()->shape(), HloOpcode::kSubtract, indices.hlo(), + indices_min)); + auto pscatter = b_.AddInstruction(HloInstruction::CreateScatter( + operand.hlo()->shape(), operand.hlo(), adjusted_indices, + updates.hlo(), scatter->to_apply(), dnums, + scatter->indices_are_sorted(), scatter->unique_indices())); + pscatter->set_sharding(operand.sharding()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pscatter, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleSlice(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto operand = GetPartitionedHlo(hlo->operand(0)).Reshard(sharding); + + // Create a window config to represent the slice. 
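+ // A slice is modeled as a size-1 window with negative padding. For example
+ // (illustrative numbers): taking elements [2, 9) of a 12-element dimension
+ // gives padding_low = -2 and padding_high = 9 - 12 = -3, which lets the
+ // windowed-reshard machinery fetch exactly the sliced region per shard.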
+ Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(hlo->slice_strides(i)); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_padding_low(-hlo->slice_starts(i)); + dim->set_padding_high(hlo->slice_limits(i) - + hlo->operand(0)->shape().dimensions(i)); + dim->set_base_dilation(1); + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + const Shape& operand_shape = reshard_operand->sharded_input->shape(); + + std::vector start_indices = hlo->slice_starts(); + std::vector limit_indices = hlo->slice_limits(); + std::vector strides = hlo->slice_strides(); + bool need_slice = false; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + auto dim = reshard_operand->shard_window.dimensions(i); + start_indices[i] = -dim.padding_low(); + limit_indices[i] = operand_shape.dimensions(i) + dim.padding_high(); + if (start_indices[i] != 0 || strides[i] != 1 || + limit_indices[i] != operand_shape.dimensions(i)) { + need_slice = true; + } + } + + SetPartitionedHlo(hlo, [&] { + if (need_slice) { + auto shard_shape = MakePartitionedShape(hlo->shape(), sharding); + return b_.AddInstruction(HloInstruction::CreateSlice( + shard_shape, reshard_operand->sharded_input, start_indices, + limit_indices, strides)); + } + return reshard_operand->sharded_input; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSort(HloInstruction* hlo) { + HloSharding sharding = hlo->sharding(); + if (hlo->shape().IsTuple()) { + // Check that all elements are sharded in the same way. + if (hlo->shape().tuple_shapes_size() == 0) { + return DefaultAction(hlo); + } + sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + if (sharding != hlo->sharding().GetSubSharding(hlo->shape(), {i})) { + return DefaultAction(hlo); + } + } + } + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 dim : hlo->dimensions()) { + if (sharding.tile_assignment().dim(dim) > 1) { + return DefaultAction(hlo); + } + } + // Reshard operands to the same as the output. + std::vector new_operands; + for (HloInstruction* operand : hlo->operands()) { + new_operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + } + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleCustomCall(HloInstruction* hlo) { + if (hlo->custom_call_target() == "SPMDFullToShardShape") { + // This op switches from auto partitioning to manual partitioning. 
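+ // This custom call is lowered to a copy of the operand's local shard (padded
+ // with zeros when the partitioning is uneven), so downstream manually
+ // partitioned ops see the per-partition shard shape directly.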
+ auto input_partitioned = GetPartitionedHlo(hlo->operand(0)); + if (!EvenlyPartitions(hlo->shape(), input_partitioned.sharding())) { + input_partitioned = input_partitioned.PadWithValue( + CreateR0WithType(hlo->shape().element_type(), 0, &b_)); + } + auto input = input_partitioned.hlo(); + CHECK(hlo->sharding().IsReplicated()); + CHECK(ShapeUtil::Compatible(input->shape(), hlo->shape())); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() == "SPMDShardToFullShape") { + // This op switches from manual partitioning to auto partitioning. + auto input = GetPartitionedHlo(hlo->operand(0)).hlo(); + CHECK(input->sharding().IsReplicated()); + auto copy = b_.AddInstruction( + HloInstruction::CreateUnary(input->shape(), HloOpcode::kCopy, input)); + CHECK(ShapeUtil::Compatible( + copy->shape(), MakePartitionedShape(hlo->shape(), hlo->sharding()))); + SetPartitionedHlo(hlo, [&] { return copy; }); + return Status::OK(); + } + if (hlo->custom_call_target() != "TopK") { + return DefaultAction(hlo); + } + + if (!hlo->operand(0)->has_sharding()) { + return DefaultAction(hlo); + } + + const HloSharding& sharding = hlo->operand(0)->sharding(); + if (sharding.IsTileMaximal() || sharding.IsReplicated()) { + return DefaultAction(hlo); + } + + const int64 sort_dim = 1; + const int64 shard_count = sharding.tile_assignment().dim(sort_dim); + + if (shard_count <= 1) { + return DefaultAction(hlo); + } + + const int64 input_size = hlo->operand(0)->shape().dimensions(sort_dim); + const int64 batch_size = hlo->shape().tuple_shapes(0).dimensions(0); + const int64 k = hlo->shape().tuple_shapes(0).dimensions(sort_dim); + const int64 per_partition_size = CeilOfRatio(input_size, shard_count); + + if (k >= per_partition_size) { + return DefaultAction(hlo); + } + + auto input = hlo->operand(0); + const auto element_type = input->shape().element_type(); + + // Pad input with minimal value. + auto min_value = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::MinValue(element_type))); + // TODO(wangtao): add test to see if -NaN < -Inf in BF16. + if (element_type == F32) { + auto float_pad_value = std::numeric_limits::quiet_NaN(); + min_value = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(-float_pad_value))); + } + auto partitioned_input = GetPartitionedHlo(input).PadWithValue(min_value); + + // Each partition needs to do TopK separately, thus the base shape + // becomes [batch_size, k * shard_count]. + const Shape replicated_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(hlo->operand(0)->shape().element_type(), + {batch_size, k * shard_count}), + ShapeUtil::MakeShape(S32, {batch_size, k * shard_count})}); + auto custom_call_sharding = + sharding.GetTupleSharding(replicated_shape).ValueOrDie(); + auto shard_shape = + MakePartitionedShape(replicated_shape, custom_call_sharding); + auto topk = b_.AddInstruction( + hlo->CloneWithNewOperands(shard_shape, {partitioned_input.hlo()})); + topk->set_sharding(custom_call_sharding); + // Partition customcall. + PartitionedHlo partitioned_topk(topk, replicated_shape, + MakePartitioningState()); + topk = partitioned_topk.hlo(); + + // Get value from TopK. + HloInstruction* value_gte = + b_.AddInstruction(HloInstruction::CreateGetTupleElement( + topk->shape().tuple_shapes(0), topk, 0)); + value_gte->set_sharding(sharding); + // Partition GetTupleElement of value. 
+  PartitionedHlo value_partitioned_gte(
+      value_gte, partitioned_topk.base_shape().tuple_shapes(0),
+      MakePartitioningState());
+  // Reshard value to be replicated.
+  auto replicated_value_gte =
+      value_partitioned_gte.Reshard(HloSharding::Replicate()).hlo();
+
+  // Get index from TopK.
+  HloInstruction* index_gte =
+      b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+          topk->shape().tuple_shapes(1), topk, 1));
+  auto partition_id_s32 = b_.AddInstruction(HloInstruction::CreateConvert(
+      ShapeUtil::MakeShape(S32, partition_id_->shape().dimensions()),
+      partition_id_));
+  // Add the per-partition offset to the index; the index returned from the
+  // CustomCall always starts from 0.
+  auto index_offset = b_.AddInstruction(HloInstruction::CreateBroadcast(
+      index_gte->shape(),
+      b_.AddInstruction(HloInstruction::CreateBinary(
+          partition_id_s32->shape(), HloOpcode::kMultiply, partition_id_s32,
+          b_.AddInstruction(HloInstruction::CreateConstant(
+              LiteralUtil::CreateR0<int32>(per_partition_size))))),
+      {}));
+  index_gte = b_.AddInstruction(HloInstruction::CreateBinary(
+      index_offset->shape(), HloOpcode::kAdd, index_gte, index_offset));
+  index_gte->set_sharding(sharding);
+  // Partition GetTupleElement of index.
+  PartitionedHlo index_partitioned_gte(
+      index_gte, partitioned_topk.base_shape().tuple_shapes(1),
+      MakePartitioningState());
+  // Reshard index to be replicated.
+  auto replicated_index_gte =
+      index_partitioned_gte.Reshard(HloSharding::Replicate()).hlo();
+
+  // Create a replicated sort to do TopK; the input is the value and index
+  // pairs from all the partitions. The reason to use Sort instead of the
+  // CustomCall TopK is that the CustomCall only takes values as input, so an
+  // extra Gather would be needed to recover the correct indices if the
+  // CustomCall were used here.
+
+  // Create comparator for the sort.
+  XlaBuilder b("Sort.Compare");
+  XlaComputation comparator = CreateScalarComparisonComputation(
+      "compare-value-and-index", {input->shape().element_type(), S32}, {Gt, Lt},
+      &b);
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape, comparator.GetProgramShape());
+  HloModuleConfig config(program_shape);
+  TF_ASSIGN_OR_RETURN(auto new_module,
+                      HloModule::CreateFromProto(comparator.proto(), config));
+  HloCloneContext context(module_);
+  auto compare_computation =
+      module_->DeepCloneComputation(new_module->entry_computation(), &context);
+  auto sort = b_.AddInstruction(HloInstruction::CreateSort(
+      replicated_shape, sort_dim, {replicated_value_gte, replicated_index_gte},
+      compare_computation, /*is_stable=*/true));
+  sort->set_sharding(
+      HloSharding::Replicate().GetTupleSharding(sort->shape()).ValueOrDie());
+  PartitionedHlo replicated_sort(sort, replicated_shape,
+                                 MakePartitioningState());
+
+  // Slice value and index from top-k for output.
+  HloInstruction* sort_value_gte =
+      b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+          replicated_sort.hlo()->shape().tuple_shapes(0), replicated_sort.hlo(),
+          0));
+  HloInstruction* sort_index_gte =
+      b_.AddInstruction(HloInstruction::CreateGetTupleElement(
+          replicated_sort.hlo()->shape().tuple_shapes(1), replicated_sort.hlo(),
+          1));
+  const Shape& hlo_shape = sort_value_gte->shape();
+  auto hlo_dims = hlo_shape.dimensions();
+  std::vector<int64> start_indices(hlo_shape.dimensions_size(), 0);
+  std::vector<int64> limit_indices(hlo_dims.begin(), hlo_dims.end());
+  std::vector<int64> strides(hlo_shape.dimensions_size(), sort_dim);
+  limit_indices[sort_dim] = k;
+  auto output_shape = hlo_shape;
+  output_shape.set_dimensions(sort_dim, k);
+  // Slice value from final sort.
+  HloInstruction* slice_sort_value =
+      b_.AddInstruction(HloInstruction::CreateSlice(
+          output_shape, sort_value_gte, start_indices, limit_indices, strides));
+  // Slice index from final sort.
+  auto index_output_shape = sort_index_gte->shape();
+  index_output_shape.set_dimensions(sort_dim, k);
+  HloInstruction* slice_index_value = b_.AddInstruction(
+      HloInstruction::CreateSlice(index_output_shape, sort_index_gte,
+                                  start_indices, limit_indices, strides));
+  auto create_tuple = b_.AddInstruction(
+      HloInstruction::CreateTuple({slice_sort_value, slice_index_value}));
+  create_tuple->set_sharding(HloSharding::Replicate());
+
+  SetPartitionedHlo(hlo, PartitionedHlo(create_tuple, create_tuple->shape(),
+                                        MakePartitioningState())
+                             .Reshard(hlo->sharding()));
+
+  return Status::OK();
+}
+
+Status SpmdPartitioningVisitor::HandleTranspose(HloInstruction* hlo) {
+  const HloSharding& sharding = hlo->sharding();
+  if (sharding.IsTileMaximal()) {
+    return DefaultAction(hlo);
+  }
+
+  std::vector<int64> inverse_dimensions(hlo->shape().rank());
+  for (int64 i = 0; i < hlo->shape().rank(); ++i) {
+    inverse_dimensions[hlo->dimensions(i)] = i;
+  }
+  auto desired_operand_sharding =
+      hlo_sharding_util::TransposeSharding(sharding, inverse_dimensions);
+
+  auto operand = GetPartitionedHlo(hlo->operand(0))
+                     .Reshard(desired_operand_sharding)
+                     .hlo();
+  SetPartitionedHlo(hlo, [&] {
+    return b_.AddInstruction(hlo->CloneWithNewOperands(
+        MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand}));
+  });
+  return Status::OK();
+}
+
+Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) {
+  const HloSharding& sharding = hlo->sharding();
+  if (sharding.IsTileMaximal()) {
+    return DefaultAction(hlo);
+  }
+
+  auto operand = GetPartitionedHlo(hlo->operand(0));
+  // The output shape is the source and the operand shape is the target to get
+  // the aligned sharding for the operand.
+  auto desired_operand_sharding = hlo_sharding_util::ReshapeSharding(
+      hlo->shape(), hlo->operand(0)->shape(), hlo->sharding());
+  if (desired_operand_sharding.has_value()) {
+    auto operand_hlo = operand.Reshard(*desired_operand_sharding).hlo();
+    SetPartitionedHlo(hlo, [&] {
+      return b_.AddInstruction(hlo->CloneWithNewOperands(
+          MakePartitionedShape(hlo->shape(), hlo->sharding()), {operand_hlo}));
+    });
+    return Status::OK();
+  }
+
+  // Try to use halo exchange for certain split-dim/merge-dims cases.
+  // ReshapeSharding failed in these cases probably due to uneven partitioning,
+  // where halo exchange could help. Specifically we check the following
+  // conditions to detect supported cases:
+  // 1) Both input and output are partitioned on one dimension.
+  // 2) The combined size of the dimensions before the partitioned dimension is
+  //    the same on input and output. This means we don't need to consider the
+  //    major dimensions.
+  // 3) Let A = the input size on the partitioned dimension, and
+  //    B = the output size on the partitioned dimension; then
+  //    either A % B == 0 (split dim) or B % A == 0 (merge dims).
+  auto maybe_input_sharded_dim = UniqueTiledDim(operand.sharding());
+  auto maybe_output_sharded_dim = UniqueTiledDim(sharding);
+  if (!maybe_input_sharded_dim || !maybe_output_sharded_dim) {
+    return DefaultAction(hlo);
+  }
+  int64 input_sharded_dim = *maybe_input_sharded_dim;
+  int64 output_sharded_dim = *maybe_output_sharded_dim;
+  // Check that the major dims before the sharded dim have the same total size
+  // for input and output.
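+  // Illustrative example (hypothetical shapes): reshaping f32[2,15] to
+  // f32[2,3,5] with 2 partitions, where the input is tiled on dimension 1
+  // (size 15) and the output is tiled on dimension 1 (size 3). Condition (2)
+  // holds since the major dims before the tiled one have size 2 on both
+  // sides, and A = 15, B = 3 with A % B == 0, so this is the split-dim case.
+  // Each input shard holds ceil(15/2) = 8 columns, but the output shard of
+  // size ceil(3/2) = 2 corresponds to 2 * 5 = 10 input columns on partition 0
+  // and the remaining 5 on partition 1; that 8-vs-10 misalignment is what the
+  // halo exchange below repairs. The code below first checks condition (2).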
+ int64 input_major_dims_size = 1; + for (int64 i = 0; i < input_sharded_dim; ++i) { + input_major_dims_size *= operand.base_shape().dimensions(i); + } + int64 output_major_dims_size = 1; + for (int64 i = 0; i < output_sharded_dim; ++i) { + output_major_dims_size *= hlo->shape().dimensions(i); + } + if (input_major_dims_size != output_major_dims_size) { + return DefaultAction(hlo); + } + // Fix potential device ordering mismatch in tile assignment. + Array new_input_tile_assignment = sharding.tile_assignment(); + new_input_tile_assignment.Reshape( + operand.sharding().tile_assignment().dimensions()); + operand = operand.Reshard(HloSharding::Tile(new_input_tile_assignment)); + + int64 input_dim_size = operand.base_shape().dimensions(input_sharded_dim); + int64 output_dim_size = hlo->shape().dimensions(output_sharded_dim); + auto input_shard_shape = + MakePartitionedShape(operand.base_shape(), operand.sharding()); + auto output_shard_shape = MakePartitionedShape(hlo->shape(), sharding); + if (input_dim_size % output_dim_size == 0) { + // Split dim. + int64 split_factor = input_dim_size / output_dim_size; + int64 output_shard_size = output_shard_shape.dimensions(output_sharded_dim); + // Use halo exchange to fix misaligned data. + Window window; + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == input_sharded_dim) { + dim->set_padding_high(output_shard_size * split_factor * + num_partitions_ - + input_dim_size); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_operand = operand.ReshardAsWindowedInput( + window, operand.sharding(), + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_operand.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_operand->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_operand->sharded_input->shape().dimensions(input_sharded_dim), + output_shard_size * split_factor); + SetPartitionedHlo(hlo, [&] { + // Do a local reshape. + return b_.AddInstruction(HloInstruction::CreateReshape( + output_shard_shape, reshard_operand->sharded_input)); + }); + return Status::OK(); + } else if (output_dim_size % input_dim_size == 0) { + // Merge dims. + int64 merge_factor = output_dim_size / input_dim_size; + // First reshape locally. (The sharded dimension could include padded data.) + auto tmp_shard_shape = output_shard_shape; + tmp_shard_shape.set_dimensions( + output_sharded_dim, + input_shard_shape.dimensions(input_sharded_dim) * merge_factor); + auto tmp_reshape = b_.AddInstruction( + HloInstruction::CreateReshape(tmp_shard_shape, operand.hlo())); + tmp_reshape->set_metadata(hlo->metadata()); + tmp_reshape->set_sharding(hlo->sharding()); + auto tmp_full_shape = tmp_shard_shape; + tmp_full_shape.set_dimensions( + output_sharded_dim, + tmp_shard_shape.dimensions(output_sharded_dim) * num_partitions_); + auto tmp_output = + PartitionedHlo(tmp_reshape, tmp_full_shape, MakePartitioningState()); + + // Use halo exchange to fix misaligned data. 
+ Window window; + for (int64 i = 0; i < tmp_shard_shape.rank(); ++i) { + WindowDimension* dim = window.add_dimensions(); + dim->set_size(1); + dim->set_stride(1); + dim->set_window_dilation(1); + dim->set_window_reversal(false); + dim->set_base_dilation(1); + dim->set_padding_low(0); + if (i == output_sharded_dim) { + dim->set_padding_high(output_dim_size - + tmp_shard_shape.dimensions(output_sharded_dim) * + num_partitions_); + } else { + dim->set_padding_high(0); + } + } + + auto reshard_output = tmp_output.ReshardAsWindowedInput( + window, sharding, + CreateZero(ShapeUtil::MakeShape(hlo->shape().element_type(), {}), &b_), + /*mask_invalid_region=*/false); + if (!reshard_output.has_value()) { + return DefaultAction(hlo); + } + TF_RET_CHECK(!reshard_output->dynamic_slice_index_on_output.has_value()); + CHECK_EQ( + reshard_output->sharded_input->shape().dimensions(input_sharded_dim), + output_shard_shape.dimensions(output_sharded_dim)); + SetPartitionedHlo(hlo, [&] { return reshard_output->sharded_input; }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleIota(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + if (sharding.IsTileMaximal()) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&] { + int64 dimension = Cast(hlo)->iota_dimension(); + auto iota = b_.AddInstruction(HloInstruction::CreateIota( + MakePartitionedShape(hlo->shape(), sharding), dimension)); + + if (sharding.tile_assignment().dim(dimension) > 1) { + auto partition_ordinals = + MakeTiledPartitionOrdinals(sharding, partition_id_, &b_); + auto multiplier = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(iota->shape().dimensions(dimension)))); + auto offset = b_.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(S32, {}), HloOpcode::kMultiply, + partition_ordinals[dimension], multiplier)); + if (iota->shape().element_type() != S32) { + offset = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::MakeShape(iota->shape().element_type(), {}), offset)); + } + auto broadcast = b_.AddInstruction( + HloInstruction::CreateBroadcast(iota->shape(), offset, {})); + return b_.AddInstruction(HloInstruction::CreateBinary( + iota->shape(), HloOpcode::kAdd, iota, broadcast)); + } + + return iota; + }); + + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSingleDevice(const HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + int64 device = hlo->sharding().GetUniqueDevice(); + const HloSharding sharding = HloSharding::AssignDevice(device); + + std::vector operands; + std::vector operand_shapes; + for (const HloInstruction* operand : hlo->operands()) { + operands.push_back(GetPartitionedHlo(operand).Reshard(sharding).hlo()); + operand_shapes.push_back(operand->shape()); + } + auto operand = b_.AddInstruction(HloInstruction::CreateTuple(operands)); + auto operand_shape = ShapeUtil::MakeTupleShape(operand_shapes); + + auto on_device = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(device))); + auto pred = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), partition_id_, on_device, + ComparisonDirection::kEq)); + + SpmdBuilder true_b("true_computation", visiting_hlo_); + HloComputation* true_computation; + { + auto param = true_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "true_branch_param")); + std::vector new_operands; + for (int64 i = 0; i < operands.size(); 
++i) { + new_operands.push_back(true_b.AddInstruction( + HloInstruction::CreateGetTupleElement(operand_shapes[i], param, i))); + } + auto root = true_b.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + true_computation = module_->AddEmbeddedComputation(true_b.Build(root)); + } + + SpmdBuilder false_b("false_computation", visiting_hlo_); + HloComputation* false_computation; + { + false_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, operand_shape, "false_branch_param")); + auto root = CreateZero(hlo->shape(), &false_b); + false_computation = module_->AddEmbeddedComputation(false_b.Build(root)); + } + + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + hlo->shape(), pred, operand, true_computation, operand, + false_computation)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleAllReduce(HloInstruction* hlo) { + if (hlo->IsCrossReplicaAllReduce() && hlo->operand_count() == 1) { + return HandleElementwise(hlo); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleBroadcast(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + auto& operand = GetPartitionedHlo(hlo->operand(0)); + + // Tiled output. + std::vector wanted_input_tile_size(operand.base_shape().rank()); + std::vector sharded_new_dims; + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + wanted_input_tile_size[i] = + hlo->sharding().tile_assignment().dim(hlo->dimensions(i)); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (!absl::c_linear_search(hlo->dimensions(), i) && + hlo->sharding().tile_assignment().dim(i) > 1) { + sharded_new_dims.push_back(i); + } + } + if (sharded_new_dims.empty()) { + // The new dimensions are replicated, so that we can do the adjustment on + // the input. + Array wanted_input_tile_assignment(wanted_input_tile_size); + wanted_input_tile_assignment.Each( + [&](absl::Span indices, int64* val) { + std::vector indices_in_broadcast(hlo->shape().rank(), 0); + for (int64 i = 0; i < operand.base_shape().rank(); ++i) { + indices_in_broadcast[hlo->dimensions(i)] = indices[i]; + } + *val = hlo->sharding().tile_assignment()(indices_in_broadcast); + }); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(hlo->CloneWithNewOperands( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + {operand.Reshard(HloSharding::Tile(wanted_input_tile_assignment)) + .hlo()})); + }); + } else { + auto input = operand.Reshard(HloSharding::Replicate()).hlo(); + // We pad and shard the input first, then broadcast to the final shard + // shape. + auto output_offsets = + MakePartitionOffsets(hlo->shape(), hlo->sharding(), partition_id_, &b_); + std::vector input_offsets(operand.base_shape().rank()); + auto output_shard_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto input_shard_shape = input->shape(); + auto padded_input_shape = input->shape(); + for (int64 i = 0; i < input_offsets.size(); ++i) { + input_offsets[i] = output_offsets[hlo->dimensions(i)]; + input_shard_shape.set_dimensions( + i, output_shard_shape.dimensions(hlo->dimensions(i))); + padded_input_shape.set_dimensions( + i, hlo->sharding().tile_assignment().dim(hlo->dimensions(i)) * + input_shard_shape.dimensions(i)); + } + auto padded_input = PadToShape(input, padded_input_shape, &b_); + auto input_shard = + ShapeUtil::Compatible(input_shard_shape, padded_input->shape()) + ? 
padded_input + : b_.AddInstruction(HloInstruction::CreateDynamicSlice( + input_shard_shape, padded_input, input_offsets, + input_shard_shape.dimensions())); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(output_shard_shape, {input_shard})); + }); + } + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConstant(HloInstruction* hlo) { + const Literal& literal = hlo->literal(); + if (literal.shape().IsTuple() || + (!hlo->sharding().IsTileMaximal() && + (!EvenlyPartitions(hlo->shape(), hlo->sharding()) || + !literal.IsAllFirst()))) { + return DefaultAction(hlo); + } + + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + std::vector start_indices(hlo->shape().rank(), 0); + auto constant = b_.AddInstruction(HloInstruction::CreateConstant( + literal.Slice(start_indices, shard_shape.dimensions()))); + *constant->mutable_shape() = shard_shape; + return constant; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->dynamic_slice_sizes()[i] != hlo->shape().dimensions(i) || + !hlo->operand(i + 1)->IsConstant() || + !hlo->operand(i + 1)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. + return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. + new_indices[i] = GetPartitionedHlo(hlo->operand(i + 1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + partitioned_shape, new_input, new_indices, + partitioned_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleDynamicUpdateSlice(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) != 1 && + (hlo->operand(1)->shape().dimensions(i) != hlo->shape().dimensions(i) || + !hlo->operand(i + 2)->IsConstant() || + !hlo->operand(i + 2)->literal().IsZero({}))) { + // We currently do not partition the sliced dimensions. + return DefaultAction(hlo); + } + } + std::vector new_indices(hlo->shape().rank()); + auto new_input = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto new_update = + GetPartitionedHlo(hlo->operand(1)).Reshard(hlo->sharding()).hlo(); + for (int64 i = 0; i < new_indices.size(); ++i) { + // Replicate the indices. 
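+    // Every partition must see the same (global) update offsets so that each
+    // one applies the update to the correct region of its local shard; the
+    // sharded dimensions were already checked above to use a constant 0 start.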
+ new_indices[i] = GetPartitionedHlo(hlo->operand(i + 2)) + .Reshard(HloSharding::Replicate()) + .hlo(); + } + SetPartitionedHlo(hlo, [&]() { + auto partitioned_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + partitioned_shape, new_input, new_update, new_indices)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { + auto gather = Cast(hlo); + const auto& dnums = gather->gather_dimension_numbers(); + auto operand = GetPartitionedHlo(gather->operand(0)); + auto indices = GetPartitionedHlo(gather->operand(1)); + std::vector collapsed_slice_dims(dnums.collapsed_slice_dims().begin(), + dnums.collapsed_slice_dims().end()); + std::vector start_index_map(dnums.start_index_map().begin(), + dnums.start_index_map().end()); + std::vector offset_dims(dnums.offset_dims().begin(), + dnums.offset_dims().end()); + if (!operand.sharding().IsTileMaximal()) { + auto maybe_passthrough = PassthroughOperandToGatherOutputOrScatterUpdate( + operand, gather->shape(), collapsed_slice_dims, start_index_map, + offset_dims, gather->gather_slice_sizes()); + if (maybe_passthrough.has_value()) { + indices = indices.Reshard(HloSharding::Replicate()); + auto pshape = MakePartitionedShape(gather->shape(), *maybe_passthrough); + std::vector pslice_sizes(gather->gather_slice_sizes().begin(), + gather->gather_slice_sizes().end()); + for (int64 i = 0; i < pslice_sizes.size(); ++i) { + if (operand.sharding().tile_assignment().dim(i) > 1) { + pslice_sizes[i] = operand.hlo()->shape().dimensions(i); + } + } + auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + pshape, operand.hlo(), indices.hlo(), dnums, pslice_sizes, + gather->indices_are_sorted())); + pgather->set_sharding(*maybe_passthrough); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(pgather, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( + operand, start_index_map, gather->gather_slice_sizes(), + num_partitions_) && + ShapeUtil::ByteSizeOf(gather->shape()) < + ShapeUtil::ByteSizeOf(gather->operand(0)->shape())) { + indices = indices.Reshard(HloSharding::Replicate()); + // Now the operand is partitioned in trivial slice dimensions, and the + // indices are replicated. We execute a gather on partitioned operand, + // with full number of indices, where out-of-bounds indices are clamped, + // and masked out with 0 in the result; then we use all-reduce to combine + // results. Although gather will not get faster, we avoided the need to + // replicate the operand. + HloInstruction* indices_min; + HloInstruction* indices_max; + std::tie(indices_min, indices_max) = + IndexBoundsForGatherScatterOperandPartitionedOnTrivialSliceDims( + operand, indices, partition_id_, start_index_map, + dnums.index_vector_dim(), &b_); + // Clamp the indices. + auto adjusted_indices = b_.AddInstruction(HloInstruction::CreateTernary( + indices.base_shape(), HloOpcode::kClamp, indices_min, indices.hlo(), + indices_max)); + // Adjust the indices by subtracting the offset. + adjusted_indices = b_.AddInstruction(HloInstruction::CreateBinary( + indices.base_shape(), HloOpcode::kSubtract, adjusted_indices, + indices_min)); + // Gather on adjusted indices. 
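+      // Illustrative example (hypothetical shapes): gathering rows from an
+      // operand f32[16,4] that is tiled 2-ways on dimension 0, so partition 0
+      // holds rows [0, 8) and partition 1 holds rows [8, 16). With replicated
+      // indices, each partition clamps every index into its own row range,
+      // subtracts its row offset to get local coordinates, and gathers
+      // locally. Rows whose original index fell outside the local range are
+      // masked to zero below, and the final cross-partition all-reduce sums
+      // the per-partition results; exactly one partition contributes each
+      // row, so the sum reconstructs the unpartitioned gather. The gather
+      // below runs on the adjusted (local) indices.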
+ auto pgather = b_.AddInstruction(HloInstruction::CreateGather( + gather->shape(), operand.hlo(), adjusted_indices, dnums, + gather->gather_slice_sizes(), gather->indices_are_sorted())); + // Mask out invalid results. + auto filter = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_min, ComparisonDirection::kLt)); + filter = b_.AddInstruction(HloInstruction::CreateBinary( + filter->shape(), HloOpcode::kOr, filter, + b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::ChangeElementType(indices.base_shape(), PRED), + indices.hlo(), indices_max, ComparisonDirection::kGt)))); + if (dnums.index_vector_dim() < indices.base_shape().rank()) { + std::vector reduced_filter_dims; + for (int64 i = 0; i < filter->shape().rank(); ++i) { + if (i != dnums.index_vector_dim()) { + reduced_filter_dims.push_back(filter->shape().dimensions(i)); + } + } + filter = b_.AddInstruction(HloInstruction::CreateReduce( + ShapeUtil::MakeShape(PRED, reduced_filter_dims), filter, + CreateR0WithType(PRED, false, &b_), {dnums.index_vector_dim()}, + MakeBinaryAdd(PRED, module_))); + } + std::vector batch_dims; + for (int64 i = 0; i < pgather->shape().rank(); ++i) { + if (!absl::c_linear_search(dnums.offset_dims(), i)) { + batch_dims.push_back(i); + } + } + auto broadcast_filter = b_.AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::ChangeElementType(pgather->shape(), PRED), filter, + batch_dims)); + auto filtered = b_.AddInstruction(HloInstruction::CreateTernary( + pgather->shape(), HloOpcode::kSelect, broadcast_filter, + CreateZero(pgather->shape(), &b_), pgather)); + // Combine from different partitions. + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, filtered, + MakeBinaryAdd(filtered->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + SetPartitionedHlo(hlo, [&]() { + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleGetTupleElement(HloInstruction* hlo) { + const auto& tuple = GetPartitionedHlo(hlo->operand(0)); + auto gte = b_.AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(tuple.hlo()->shape(), hlo->tuple_index()), + tuple.hlo(), hlo->tuple_index())); + SetPartitionedHlo(hlo, [&]() { + const auto source_sharding = tuple.sharding().GetSubSharding( + tuple.base_shape(), {hlo->tuple_index()}); + gte->set_sharding(source_sharding); + PartitionedHlo source_partitioned_gte(gte, hlo->shape(), + MakePartitioningState()); + return source_partitioned_gte.Reshard(hlo->sharding()).hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleInfeed(HloInstruction* hlo) { + const Shape& shape = ShapeUtil::GetTupleElementShape(hlo->shape(), 0); + auto token = GetPartitionedHlo(hlo->operand(0)).hlo(); + if (ShapeUtil::GetLeafCount(shape) == 0) { + // TODO(b/155819021): HloSharding has issues with tuple-shaped sharding: it + // requires one element for an empty tuple, but leaf-count number of + // elements for non-empty tuple. So if it has a nested empty tuple, we + // cannot invoke GetSubSharding() since it expects a sharding for the empty + // tuple. This is a workaround for that case. 
+ SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction( + HloInstruction::CreateInfeed(shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + auto sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + auto shard_shape = MakePartitionedShape(shape, sharding); + if (EvenlyPartitions(shape, sharding)) { + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateInfeed( + shard_shape, token, hlo->infeed_config())); + }); + return Status::OK(); + } + + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + // Create a branch for each unique partitioned shape. + std::vector per_branch_partitioned_shapes; + std::vector conditional_branch_indices(num_partitions_); + for (int64 i = 0; i < num_partitions_; ++i) { + auto partitioned_shape = + MakeNonPaddedShapeForGivenPartition(shape, sharding, i); + int64 matching_existing_index = 0; + for (; matching_existing_index < per_branch_partitioned_shapes.size(); + ++matching_existing_index) { + if (ShapeUtil::Compatible( + partitioned_shape, + per_branch_partitioned_shapes[matching_existing_index])) { + break; + } + } + if (matching_existing_index < per_branch_partitioned_shapes.size()) { + conditional_branch_indices[i] = matching_existing_index; + } else { + conditional_branch_indices[i] = per_branch_partitioned_shapes.size(); + per_branch_partitioned_shapes.push_back(std::move(partitioned_shape)); + } + } + + HloInstruction* branch_index; + if (per_branch_partitioned_shapes.size() == num_partitions_) { + // Use partition ID as the branch index if each partition has its own + // branch. + branch_index = partition_id_; + // PartitionId's output is U32 but conditional requires S32. + if (branch_index->shape().element_type() != S32) { + branch_index = b_.AddInstruction(HloInstruction::CreateConvert( + ShapeUtil::ChangeElementType(branch_index->shape(), S32), + branch_index)); + } + } else { + // Otherwise, use a constant table to look up the branch index. + auto branch_index_table = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR1(conditional_branch_indices))); + branch_index = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1}), branch_index_table, {partition_id_}, + {1})); + branch_index = b_.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(S32, {}), branch_index)); + } + + std::vector branches(per_branch_partitioned_shapes.size()); + for (int64 i = 0; i < branches.size(); ++i) { + SpmdBuilder branch_b(absl::StrCat("infeed_branch_", i), visiting_hlo_); + auto param = branch_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, token->shape(), "infeed_token_param")); + auto infeed = branch_b.AddInstruction(HloInstruction::CreateInfeed( + per_branch_partitioned_shapes[i], param, hlo->infeed_config())); + branches[i] = module_->AddEmbeddedComputation(branch_b.Build(infeed)); + if (!ShapeUtil::Compatible(per_branch_partitioned_shapes[i], shard_shape)) { + TF_ASSIGN_OR_RETURN( + auto padded, + branches[i]->DeepCopyInstructionWithCustomCopier( + infeed, [&](HloInstruction* leaf, const ShapeIndex& leaf_index, + HloComputation* comp) { + // Index {1} corresponds to the token. 
+ if (leaf_index.empty() || leaf_index[0] != 0) { + return leaf; + } + ShapeIndexView subindex(leaf_index, 1); + if (ShapeUtil::Compatible( + ShapeUtil::GetSubshape(per_branch_partitioned_shapes[i], + subindex), + ShapeUtil::GetSubshape(shard_shape, subindex))) { + return leaf; + } + return PadToShape(leaf, + ShapeUtil::GetSubshape(shard_shape, subindex), + nullptr, comp); + })); + branches[i]->set_root_instruction(padded, + /*accept_different_shape=*/true); + } + } + SetPartitionedHlo(hlo, [&]() { + return b_.AddInstruction(HloInstruction::CreateConditional( + ShapeUtil::MakeTupleShape({shard_shape, token->shape()}), branch_index, + branches, std::vector(branches.size(), token))); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandlePad(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + const auto& pd = hlo->padding_config().dimensions(i); + // Right now we only support non-padded dimensions to be partitioned. + if (hlo->sharding().tile_assignment().dim(i) > 1 && + (pd.edge_padding_high() != 0 || pd.edge_padding_low() != 0 || + pd.interior_padding() != 0)) { + return DefaultAction(hlo); + } + } + auto resharded_lhs = + GetPartitionedHlo(hlo->operand(0)).Reshard(hlo->sharding()).hlo(); + auto replicated_rhs = GetPartitionedHlo(hlo->operand(1)) + .Reshard(HloSharding::Replicate()) + .hlo(); + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + return b_.AddInstruction(hlo->CloneWithNewOperands( + shard_shape, {resharded_lhs, replicated_rhs})); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleParameter(HloInstruction* hlo) { + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto new_param = b_.AddInstruction(HloInstruction::CreateParameter( + hlo->parameter_number(), shard_shape, "param")); + if (hlo->parameter_replicated_at_leaf_buffers()) { + new_param->set_parameter_replicated_at_leaf_buffers( + *hlo->parameter_replicated_at_leaf_buffers()); + } + return new_param; + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduce(HloInstruction* hlo) { + int64 input_count = 1; + auto per_input_sharding = hlo->sharding(); + if (hlo->shape().IsTuple()) { + input_count = hlo->shape().tuple_shapes_size(); + CHECK_GT(input_count, 0); + per_input_sharding = hlo->sharding().GetSubSharding(hlo->shape(), {0}); + } + + std::vector inputs; + std::vector inits; + for (int64 operand_id = 0; operand_id < input_count; ++operand_id) { + inits.push_back(GetPartitionedHlo(hlo->operand(operand_id + input_count)) + .Reshard(HloSharding::Replicate()) + .hlo()); + inputs.push_back(GetPartitionedHlo(hlo->operand(operand_id))); + if (operand_id > 0) { + // Make sure all operands are sharded in the same way. + inputs.back() = inputs.back().Reshard(inputs[0].sharding()); + } + if (!inputs[0].sharding().IsTileMaximal()) { + inputs.back() = inputs.back().PadWithValue(inits[operand_id]); + } + } + bool reduce_sharded_dimension = false; + if (!inputs[0].sharding().IsTileMaximal()) { + reduce_sharded_dimension = absl::c_any_of(hlo->dimensions(), [&](int64 i) { + return inputs[0].sharding().tile_assignment().dim(i) > 1; + }); + + // reduce_sharded_dimension is not supported for tuple-shaped reduces. 
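+    // A sharded reduce dimension means each partition only produces a partial
+    // result, which is then combined with a cross-partition all-reduce that
+    // reuses the reduce's computation. That all-reduce path below assumes an
+    // array-shaped (single-input) reduce, so variadic (tuple-shaped) reduces
+    // fall back to the default action here.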
+ if (reduce_sharded_dimension && input_count > 1) { + return DefaultAction(hlo); + } + + // Currently we only support reducing all or none of the sharded + // dimensions. + if (reduce_sharded_dimension) { + for (int64 i = 0; i < inputs[0].base_shape().rank(); ++i) { + if (inputs[0].sharding().tile_assignment().dim(i) > 1 && + absl::c_count(hlo->dimensions(), i) == 0) { + return DefaultAction(hlo); + } + } + } + } + + std::vector new_operand_shapes(input_count * 2); + for (int64 i = 0; i < input_count; ++i) { + new_operand_shapes[i] = inputs[i].hlo()->mutable_shape(); + new_operand_shapes[i + input_count] = inits[i]->mutable_shape(); + } + // Create the shard shape of the reduce result. + TF_ASSIGN_OR_RETURN( + auto reduce_shape, + ShapeInference::InferReduceShape(new_operand_shapes, hlo->dimensions(), + hlo->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = hlo->shape().layout(); + + std::vector input_hlos(input_count); + for (int64 i = 0; i < input_count; ++i) { + input_hlos[i] = inputs[i].hlo(); + } + auto local_reduce = b_.AddInstruction(HloInstruction::CreateReduce( + reduce_shape, input_hlos, inits, hlo->dimensions(), hlo->to_apply())); + local_reduce->set_metadata(hlo->metadata()); + + SetPartitionedHlo(hlo, [&]() { + HloInstruction* reduce; + if (reduce_sharded_dimension) { + CHECK(local_reduce->shape().IsArray()); + reduce = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, local_reduce, hlo->to_apply(), NewChannel()); + reduce->set_sharding(HloSharding::Replicate()); + } else { + reduce = local_reduce; + if (inputs[0].sharding().IsTileMaximal()) { + reduce->set_sharding(inputs[0].sharding()); + } else { + // Remove tile assignment dimensions that are reduced. + std::vector tile_dimensions; + for (int64 i = 0; i < input_hlos[0]->shape().rank(); ++i) { + if (absl::c_count(hlo->dimensions(), i) == 0) { + tile_dimensions.push_back( + inputs[0].sharding().tile_assignment().dim(i)); + } + } + Array new_tile = inputs[0].sharding().tile_assignment(); + new_tile.Reshape(tile_dimensions); + auto sharding = HloSharding::Tile(new_tile); + if (input_count > 1) { + std::vector tuple(input_count, sharding); + sharding = HloSharding::Tuple(hlo->shape(), tuple); + } + reduce->set_sharding(sharding); + } + } + + return PartitionedHlo(reduce, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReverse(HloInstruction* hlo) { + auto reverse = Cast(hlo); + if (reverse->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + if (absl::c_all_of(reverse->dimensions(), [&](int64 d) { + return reverse->sharding().tile_assignment().dim(d) == 1; + })) { + auto operand = + GetPartitionedHlo(reverse->operand(0)).Reshard(reverse->sharding()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction( + hlo->CloneWithNewOperands(operand.hlo()->shape(), {operand.hlo()})); + }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleWhile(HloInstruction* hlo) { + const HloSharding& sharding = hlo->sharding(); + + // Shardings for the body parameter, body root, and cond parameter must be + // the same, and the condition root must be replicated so that all partitions + // follow the same control flow. 
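+  // If the predicate were allowed to differ across partitions, some
+  // partitions could run extra iterations and the collectives emitted inside
+  // the partitioned body would no longer match up across partitions.
+  // Replicating the condition result keeps every partition on the same trip
+  // count.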
+ hlo->while_condition()->parameter_instruction(0)->set_sharding(sharding); + hlo->while_body()->parameter_instruction(0)->set_sharding(sharding); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_condition(), + HloSharding::Replicate(), + next_channel_id_, logger_) + .status()); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(hlo->while_body(), sharding, + next_channel_id_, logger_) + .status()); + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateWhile( + MakePartitionedShape(hlo->shape(), sharding), hlo->while_condition(), + hlo->while_body(), + GetPartitionedHlo(hlo->operand(0)).Reshard(sharding).hlo())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConditional(HloInstruction* hlo) { + std::vector branch_args; + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + + // Shardings of the branch computation parameter and its argument must be + // the same. + computation->parameter_instruction(0)->set_sharding( + hlo->operand(i + 1)->sharding()); + branch_args.push_back(GetPartitionedHlo(hlo->operand(i + 1)).hlo()); + } + + // The root of the branch computations must follow the sharding of the + // conditional instruction. + for (int64 i = 0; i < hlo->branch_count(); ++i) { + HloComputation* computation = hlo->branch_computation(i); + TF_RETURN_IF_ERROR(partitioner_ + ->PartitionComputation(computation, hlo->sharding(), + next_channel_id_, logger_) + .status()); + } + + // We replicate the predicate of the conditional (the first operand) so that + // all partitions follow the same control flow. + SetPartitionedHlo(hlo, [&] { + return b_.AddInstruction(HloInstruction::CreateConditional( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + GetPartitionedHlo(hlo->operand(0)) + .Reshard(HloSharding::Replicate()) + .hlo(), + hlo->called_computations(), branch_args)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleOutfeed(HloInstruction* hlo) { + TF_RET_CHECK(hlo->sharding().HasUniqueDevice()); + return HandleSingleDevice(hlo); +} + +Status SpmdPartitioningVisitor::HandleRng(HloInstruction* hlo) { + if (hlo->sharding().HasUniqueDevice()) { + return HandleSingleDevice(hlo); + } + + if (hlo->sharding().IsReplicated()) { + SetPartitionedHlo(hlo, [&] { + // Run on a single device (0) and distribute the data to all other cores. + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::AssignDevice(0)) + .hlo()); + } + auto clone = b_.AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands)); + clone->set_sharding(HloSharding::AssignDevice(0)); + return PartitionedHlo(clone, hlo->shape(), MakePartitioningState()) + .Reshard(HloSharding::Replicate()) + .hlo(); + }); + return Status::OK(); + } + + TF_RET_CHECK(!hlo->sharding().IsTileMaximal()); + SetPartitionedHlo(hlo, [&] { + // Replicate the operands and run partitioned Rng on all devices. 
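+    // In the tiled case each partition runs its own Rng over just its shard
+    // shape, so different partitions draw independent values; this differs
+    // from the replicated case above, where the values are generated once on
+    // device 0 and then broadcast so that all partitions agree.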
+ std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back(GetPartitionedHlo(hlo->operand(i)) + .Reshard(HloSharding::Replicate()) + .hlo()); + } + return b_.AddInstruction(HloInstruction::CreateRng( + MakePartitionedShape(hlo->shape(), hlo->sharding()), + hlo->random_distribution(), new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleReduceWindow(HloInstruction* hlo) { + auto& operand = GetPartitionedHlo(hlo->operand(0)); + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + + // Replicate init + auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(1)) + .Reshard(HloSharding::Replicate()); + auto resharded_operand_and_window = operand.ReshardAsWindowedInput( + hlo->window(), hlo->sharding(), replicated_init.hlo()); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + + TF_ASSIGN_OR_RETURN(Shape sharded_rw_shape, + ShapeInference::InferReduceWindowShape( + resharded_operand_and_window->sharded_input->shape(), + replicated_init.hlo()->shape(), + resharded_operand_and_window->shard_window, + hlo->to_apply()->ComputeProgramShape())); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_rw_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_rw = b_.AddInstruction(HloInstruction::CreateReduceWindow( + sharded_rw_shape, resharded_operand_and_window->sharded_input, + replicated_init.hlo(), resharded_operand_and_window->shard_window, + hlo->to_apply())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_rw->shape())); + return sharded_rw; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_rw, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleSelectAndScatter(HloInstruction* hlo) { + if (hlo->sharding().IsTileMaximal()) { + return DefaultAction(hlo); + } + auto operand = GetPartitionedHlo(hlo->operand(0)); + auto source = GetPartitionedHlo(hlo->mutable_operand(1)); + if (hlo->sharding() != operand.sharding()) { + operand = operand.Reshard(hlo->sharding()); + } + if (hlo->sharding() != source.sharding()) { + source = source.Reshard(hlo->sharding()); + } + + // For F32 and BF16 types, we can use NaN padding to workaround the issue with + // low/high padding, since comparison will return false with NaN input. 
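+  // Example of how the pad value is chosen below: if the select computation
+  // is compare(param0, param1, direction=GE), padding with -infinity means a
+  // padded element can never be selected over a real window element, so
+  // scatter never writes into the padded region. The sign of the pad value is
+  // derived from the comparison direction and the parameter order, and the
+  // trick only applies to floating-point types, hence the F32/BF16
+  // restriction checked below.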
+ if (hlo->shape().element_type() != F32 && + hlo->shape().element_type() != BF16) { + return DefaultAction(hlo); + } + + auto select = hlo->called_computations()[0]; + auto select_root = select->root_instruction(); + if (select_root->opcode() != HloOpcode::kCompare || + select_root->operand(0)->opcode() != HloOpcode::kParameter || + select_root->operand(1)->opcode() != HloOpcode::kParameter || + select_root->operand(0)->parameter_number() + + select_root->operand(1)->parameter_number() != + 1) { + return DefaultAction(hlo); + } + + float float_pad_value; + if (select_root->comparison_direction() == ComparisonDirection::kGe || + select_root->comparison_direction() == ComparisonDirection::kGt) { + if (select_root->operand(0)->parameter_number() == 0) { + float_pad_value = -std::numeric_limits::infinity(); + } else { + float_pad_value = std::numeric_limits::infinity(); + } + } else if (select_root->comparison_direction() == ComparisonDirection::kLe || + select_root->comparison_direction() == ComparisonDirection::kLt) { + if (select_root->operand(0)->parameter_number() == 0) { + float_pad_value = std::numeric_limits::infinity(); + } else { + float_pad_value = -std::numeric_limits::infinity(); + } + } else { + return DefaultAction(hlo); + } + + auto pad_value = b_.AddInstruction(HloInstruction::CreateConstant( + hlo->shape().element_type() == BF16 + ? LiteralUtil::CreateR0( + static_cast(float_pad_value)) + : LiteralUtil::CreateR0(float_pad_value))); + + // Replicate init + auto replicated_init = GetPartitionedHlo(hlo->mutable_operand(2)) + .Reshard(HloSharding::Replicate()); + + auto partition_ordinals = + MakeTiledPartitionOrdinals(hlo->sharding(), partition_id_, &b_); + + // The first window for each dimension that overlaps with the shard area. + std::vector first_window( + hlo->shape().rank()); + // The first window for each dimension that goes beyond with the shard area. + std::vector limit_window( + hlo->shape().rank()); + std::vector data_left_halo_sizes(hlo->shape().rank()); + std::vector data_right_halo_sizes(hlo->shape().rank()); + std::vector source_left_halo_sizes(hlo->shape().rank()); + std::vector source_right_halo_sizes(hlo->shape().rank()); + auto unpadded_data_shard_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto unpadded_source_shard_shape = + MakePartitionedShape(hlo->operand(1)->shape(), hlo->sharding()); + auto source_shard_hlo = source.hlo(); + auto data_shard_hlo = operand.hlo(); + for (int64 i = 0; i < hlo->shape().rank(); ++i) { + int64 shard_count = hlo->sharding().tile_assignment().dim(i); + if (shard_count == 1) { + continue; + } + // If stride > window_size, there will be gaps between windows. These gaps + // will also exist in the output, so we keep them during halo exchange. + // + // TODO(yuanzx): This could introduce overhead if partitions start at + // different offsets in a gap. 
+ auto wd = hlo->window().dimensions(i); + if (wd.stride() > wd.size()) { + wd.set_size(wd.stride()); + } + // shard_size * i < stride * k - pad_low + window_size => + // k > (shard_size * i + pad_low - window_size) / stride => + // first_k == (shard_size * i + pad_low - window_size + stride) / stride + first_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + wd.padding_low() - wd.size() + wd.stride(), wd.stride()); + // shard_size * (i + 1) <= stride * k - pad_low => + // k >= (shard_size * i + shard_size + pad_low) / stride => + // limit_k == (shard_size * i + shard_size + pad_low + stride - 1) / + // stride + limit_window[i] = MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.padding_low() + + wd.stride() - 1, + wd.stride()); + source_left_halo_sizes[i] = + MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), 0, 1) - + first_window[i]; + source_right_halo_sizes[i] = + limit_window[i] - MultiplyAddDivideOffsetCalculation( + unpadded_source_shard_shape.dimensions(i), + unpadded_source_shard_shape.dimensions(i), 1); + data_left_halo_sizes[i] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), wd.padding_low(), 1)) - + OffsetCalculation( + HloOpcode::kMultiply, first_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)); + data_right_halo_sizes[i] = + OffsetCalculation( + HloOpcode::kMultiply, limit_window[i], + MultiplyAddDivideOffsetCalculation(0, wd.stride(), 1)) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + unpadded_data_shard_shape.dimensions(i), + unpadded_data_shard_shape.dimensions(i) + wd.stride() + + wd.padding_low() - wd.size(), + 1)); + + int64 max_windows = + (limit_window[i] - first_window[i]).MaxInRange(0, shard_count); + auto first_window_hlo = + first_window[i].Calculate(partition_ordinals[i], &b_); + // Padding on the source is filled with the init value so they do not change + // the data on overlapping windows. 
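+    // Worked example with hypothetical numbers: per-shard data size 8,
+    // window size 3, stride 2, padding_low 1. Then
+    //   first_window(i) = (8*i + 1 - 3 + 2) / 2 = 4*i
+    //   limit_window(i) = (8*i + 8 + 1 + 2 - 1) / 2 = 4*i + 5
+    // so shard 0 owns windows [0, 5) and shard 1 owns windows [4, 9);
+    // window 4 starts at data position 2*4 - 1 = 7 and covers [7, 10), which
+    // straddles the shard boundary at 8, so it shows up in both ranges and
+    // max_windows = 5. The halo exchanges below give each partition the
+    // source entries and data elements for exactly its
+    // [first_window, limit_window) range.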
+ auto resharded_source = ExchangeHaloAndGetValidData( + source_shard_hlo, source.base_shape(), source_left_halo_sizes[i], + source_right_halo_sizes[i], 0, + limit_window[i].Calculate(shard_count - 1), max_windows, i, + hlo->sharding(), first_window_hlo, replicated_init.hlo(), + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_source) { + return DefaultAction(hlo); + } + source_shard_hlo = *resharded_source; + + auto offset_start_in_data = + MultiplyAddDivideOffsetCalculation(wd.stride(), 0, 1) + .Calculate(first_window_hlo, &b_); + int64 padded_data_size = + (limit_window[i].Calculate(shard_count - 1) - 1) * wd.stride() + + wd.size(); + int64 data_shard_size = (max_windows - 1) * wd.stride() + wd.size(); + auto resharded_data = ExchangeHaloAndGetValidData( + data_shard_hlo, operand.base_shape(), data_left_halo_sizes[i], + data_right_halo_sizes[i], wd.padding_low(), padded_data_size, + data_shard_size, i, hlo->sharding(), offset_start_in_data, pad_value, + partition_ordinals[i], collective_ops_creator_, next_channel_id_, &b_); + if (!resharded_data) { + return DefaultAction(hlo); + } + data_shard_hlo = *resharded_data; + } + + Window window_on_shard = hlo->window(); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + int64 shard_count = hlo->sharding().tile_assignment().dim(i); + if (shard_count == 1) { + continue; + } + auto reshard_wd = window_on_shard.mutable_dimensions(i); + // The shards are already explicitly padded. + reshard_wd->set_padding_low(0); + reshard_wd->set_padding_high(0); + } + + auto sharded_select_and_scatter = + b_.AddInstruction(HloInstruction::CreateSelectAndScatter( + data_shard_hlo->shape(), data_shard_hlo, select, window_on_shard, + source_shard_hlo, replicated_init.hlo(), + hlo->called_computations()[1])); + SetPartitionedHlo(hlo, [&]() { + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + if (ShapeUtil::Compatible(sharded_select_and_scatter->shape(), + shard_shape)) { + return sharded_select_and_scatter; + } + auto zero = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(shard_shape.rank(), zero); + for (int64 i = 0; i < window_on_shard.dimensions_size(); ++i) { + if (hlo->sharding().tile_assignment().dim(i) == 1) { + continue; + } + int64 pad_low = hlo->window().dimensions(i).padding_low(); + auto left_halo_size = + data_left_halo_sizes[i].Calculate(partition_ordinals[i], &b_); + if (data_left_halo_sizes[i].Calculate(0) == pad_low) { + slice_offsets[i] = left_halo_size; + } else { + auto is_shard0 = b_.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), zero, partition_ordinals[i], + ComparisonDirection::kEq)); + auto pad_low_hlo = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(pad_low))); + slice_offsets[i] = b_.AddInstruction(HloInstruction::CreateTernary( + zero->shape(), HloOpcode::kSelect, is_shard0, pad_low_hlo, + left_halo_size)); + } + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_select_and_scatter, slice_offsets, + shard_shape.dimensions())); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleTuple(HloInstruction* hlo) { + std::vector new_operands; + for (int64 i = 0; i < hlo->operand_count(); ++i) { + new_operands.push_back( + GetPartitionedHlo(hlo->operand(i)) + .Reshard(hlo->sharding().GetSubSharding(hlo->shape(), {i})) + .hlo()); + } + SetPartitionedHlo(hlo, [&]() { + return 
b_.AddInstruction(HloInstruction::CreateTuple(new_operands)); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( + HloInstruction* hlo) { + TF_RET_CHECK(hlo->opcode() == HloOpcode::kConvolution); + + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + TF_RET_CHECK(!lhs.sharding().IsTileMaximal() && + !rhs.sharding().IsTileMaximal()); + + const auto& dnums = hlo->convolution_dimension_numbers(); + + // Check if the operand shardings are aligned. Also we currently don't + // support partitioning non-spatial dimensions. + std::vector rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + return lhs_sharding.tile_assignment().dim(dnums.input_batch_dimension()) != + 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + // Reshard LHS by exchanging halo such that each shard computes the partial + // sum of the full shape result, and add AllReduce. + // + // The size of halo on each dimension can be calculated from the projection + // onto the LHS that each RHS shard i needs to read. RHS and LHS below refers + // to the shard size of RHS and LHS, WC is the number of windows, and D is the + // window dilation. 
+ // + // * offset(i): RHS * D * i - low_padding + // * limit(i): {(RHS - 1) * D + 1} * (i + 1) + (WC - 1) * stride - low_padding + // + // Since shard i has LHS of range [i * LHS, (i + 1) * LHS) + // * left-halo: i * LHS - offset(i) + // = (LHS - RHS) * i + low_padding + // * right-halo: limit(i) - (i + 1) * LHS + // = [{(RHS - 1) * D + 1} - LHS] * (i + 1) + (WC - 1) * stride - low_padding + + Window window = hlo->window(); + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = lhs.sharding().tile_assignment().dim(lhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1 || wd.window_reversal()) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = + CeilOfRatio(lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = + CeilOfRatio(rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions(hlo->shape().rank()); + std::vector right_halo_size_functions(hlo->shape().rank()); + Window new_window = window; + + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + HloInstruction* lhs_with_halo = lhs.hlo(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. + auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + int64 rhs_shard_size_dilated = + (rhs_shard_size - 1) * wd.window_dilation() + 1; + + left_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), padding_low, + 1)); + right_halo_size_functions[lhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size_dilated - lhs_shard_size, + rhs_shard_size_dilated - lhs_shard_size + + wd.stride() * (window_count - 1) - padding_low, + 1)); + + // Exchange halo and concatenate. + int64 dim = dnums.input_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = padding_low; + int64 shard_size_with_halo = + wd.stride() * (window_count - 1) + rhs_shard_size_dilated; + + new_window.mutable_dimensions(i)->set_padding_low(0); + new_window.mutable_dimensions(i)->set_padding_high(0); + new_window.mutable_dimensions(i)->set_size(rhs_shard_size); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. 
+ // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation()); + int64 padded_full_shape_size = 0; + auto concat = ExchangeHaloAndGetValidData( + lhs_with_halo, lhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, lhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), zero, + partition_ordinals[dim], collective_ops_creator_, next_channel_id_, &b_, + /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + lhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), lhs_with_halo, rhs.hlo(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, + hlo->convolution_dimension_numbers(), hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); +} + +Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { + auto lhs = GetPartitionedHlo(hlo->operand(0)); + auto rhs = GetPartitionedHlo(hlo->operand(1)); + const HloSharding& sharding = hlo->sharding(); + const auto& dnums = hlo->convolution_dimension_numbers(); + std::vector rhs_to_lhs_indices(hlo->shape().rank()); + rhs_to_lhs_indices[dnums.kernel_output_feature_dimension()] = + dnums.input_batch_dimension(); + rhs_to_lhs_indices[dnums.kernel_input_feature_dimension()] = + dnums.input_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + rhs_to_lhs_indices[dnums.kernel_spatial_dimensions(i)] = + dnums.input_spatial_dimensions(i); + } + std::vector lhs_to_rhs_indices(hlo->shape().rank()); + for (int64 i = 0; i < rhs_to_lhs_indices.size(); ++i) { + lhs_to_rhs_indices[rhs_to_lhs_indices[i]] = i; + } + auto aligned_rhs_sharding = + hlo_sharding_util::TransposeSharding(lhs.sharding(), rhs_to_lhs_indices); + auto aligned_lhs_sharding = + hlo_sharding_util::TransposeSharding(rhs.sharding(), lhs_to_rhs_indices); + + // Handling cases where both operands' shardings are aligned. We check that + // the LHS batch dimension is not partitioned because it is mapped to the + // output feature dimension in aligned_rhs_sharding, which are not the same + // dimension. + if (!lhs.sharding().IsTileMaximal() && !rhs.sharding().IsTileMaximal()) { + if (options_.conv_halo_exchange_always_on_lhs) { + return HandleConvolutionTiledLhsAndRhs(hlo); + } else { + // Reshard RHS so that each shard computes the partial sum of the full + // shape result, and add AllReduce. See HandleConvolutionTiledLhsAndRhs() + // that reshards LHS. + // + // The size of halo on each dimension can be calculated from the + // projection onto the RHS that shard i needs to read. RHS and LHS below + // refers to the shard size of RHS and LHS, WC is the number of windows, + // and D is the window dilation. 
+ // + // * offset(i): LHS * i + low_padding - (WC - 1) * stride + // * limit(i): LHS * (i + 1) + low_padding + // + // Since shard i has RHS of range [i * RHS * D, (i + 1) * RHS * D) + // * left-halo: i * RHS - offset(i) + // = i * (RHS * D - LHS) + (WC - 1) * stride - low_padding + // * right-halo: limit(i) - (i + 1) * RHS + // = (i + 1) * (LHS - RHS * D) + low_pading + + auto unsupported_sharding = [&](const HloSharding& lhs_sharding, + const HloSharding& rhs_sharding) { + // We currently don't support partitioning input batch or output feature + // dimensions. + return lhs_sharding.tile_assignment().dim( + dnums.input_batch_dimension()) != 1 || + rhs_sharding.tile_assignment().dim( + dnums.kernel_output_feature_dimension()) != 1; + }; + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { + return DefaultAction(hlo); + } + lhs = lhs.Reshard(aligned_lhs_sharding).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + if (unsupported_sharding(lhs.sharding(), aligned_rhs_sharding)) { + return DefaultAction(hlo); + } + lhs = lhs.PadWithValue(zero); + rhs = rhs.Reshard(aligned_rhs_sharding).PadWithValue(zero); + } + + Window window = hlo->window(); + std::vector shard_counts(dnums.input_spatial_dimensions_size()); + std::vector lhs_shard_sizes(dnums.input_spatial_dimensions_size()); + std::vector rhs_shard_sizes(dnums.input_spatial_dimensions_size()); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 shard_count = rhs.sharding().tile_assignment().dim(rhs_dimension); + auto wd = window.dimensions(i); + if (wd.base_dilation() != 1 || wd.window_reversal()) { + return DefaultAction(hlo); + } + + int64 lhs_shard_size = CeilOfRatio( + lhs.base_shape().dimensions(lhs_dimension), shard_count); + int64 rhs_shard_size = CeilOfRatio( + rhs.base_shape().dimensions(rhs_dimension), shard_count); + shard_counts[i] = shard_count; + lhs_shard_sizes[i] = lhs_shard_size; + rhs_shard_sizes[i] = rhs_shard_size; + } + + std::vector left_halo_size_functions( + hlo->shape().rank()); + std::vector right_halo_size_functions( + hlo->shape().rank()); + Window new_window = window; + + // Data structures needed for Pad and DynamicSlice on LHS if needed. + bool need_dynamic_slice_lhs = false; + auto partition_ordinals = + MakeTiledPartitionOrdinals(lhs.sharding(), partition_id_, &b_); + std::vector zero_padding(hlo->shape().rank()); + PaddingConfig pad_config = + window_util::MakeSymmetricPadding(zero_padding); + auto zero_s32 = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector dynamic_slice_start_indices( + hlo->shape().rank(), zero_s32); + Shape dynamic_slice_shape = lhs.hlo()->shape(); + Shape pad_shape = lhs.hlo()->shape(); + + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + int64 lhs_dimension = dnums.input_spatial_dimensions(i); + int64 rhs_dimension = dnums.kernel_spatial_dimensions(i); + int64 lhs_shard_size = lhs_shard_sizes[i]; + int64 rhs_shard_size = rhs_shard_sizes[i]; + + if (shard_counts[i] == 1) { + continue; + } + + // Calculate the left and right halo sizes as described in the comments + // above. 
It calculates the halo sizes with dilation, so we apply + // CeilOfRatio({left,right}_halo_size, window_dilation). + auto wd = window.dimensions(i); + int64 padding_low = wd.padding_low(); + int64 padding_high = wd.padding_high(); + int64 base = lhs.base_shape().dimensions(lhs_dimension); + int64 window_count = + 1 + (padding_low + padding_high + base - + (1 + (wd.size() - 1) * wd.window_dilation())) / + wd.stride(); + left_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + (window_count - 1) * wd.stride() - padding_low + + wd.window_dilation() - 1, + wd.window_dilation())); + right_halo_size_functions[rhs_dimension] = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + lhs_shard_size - rhs_shard_size * wd.window_dilation(), + lhs_shard_size - rhs_shard_size * wd.window_dilation() + + padding_low + wd.window_dilation() - 1, + wd.window_dilation())); + + // New RHS window size includes the maximum of both left and right + // halos. + int64 halo_size = left_halo_size_functions[rhs_dimension].MaxInRange( + 1, shard_counts[i]) + + right_halo_size_functions[rhs_dimension].MaxInRange( + 0, shard_counts[i] - 1); + int64 new_window_size = + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size; + + // The amount of new low padding could be dynamic (e.g., window_dilation + // != 1), which requires pad (to the maximum) and dynamic slice on LHS. + // + // If we consider the first window, the offset of the dilated RHS that + // aligns with the first valid LHS element for shard i is 'padding_low + + // LHS * i'. When the left halo is added to RHS, the offset of the first + // RHS element is (RHS * i - left_halo) * window_dilation. The + // difference between the two values is the amount of padding_low we + // need on LHS. + auto new_padding_low_function = + OffsetCalculation( + HloOpcode::kMultiply, left_halo_size_functions[rhs_dimension], + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, wd.window_dilation(), 1))) - + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_size * wd.window_dilation() - lhs_shard_size, + -padding_low, 1)); + + int64 new_padding_low_max = + new_padding_low_function.MaxInRange(0, shard_counts[i]); + int64 new_padding_low = new_padding_low_max; + int64 new_padding_high = window_count * wd.stride() + + (new_window_size - 1) * wd.window_dilation() - + new_padding_low - lhs_shard_size; + + // We do pad/dynamic-slice only when the padding is dynamic. + if (!new_padding_low_function.IsConstant()) { + need_dynamic_slice_lhs = true; + new_padding_low = 0; + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_low(new_padding_low_max); + pad_config.mutable_dimensions(lhs_dimension) + ->set_edge_padding_high(new_padding_low_max); + pad_shape.set_dimensions(lhs_dimension, + lhs_shard_size + 2 * new_padding_low_max); + dynamic_slice_start_indices[lhs_dimension] = + (OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, new_padding_low_max, 1)) - + new_padding_low_function) + .Calculate(partition_ordinals[lhs_dimension], &b_); + dynamic_slice_shape.set_dimensions( + lhs_dimension, lhs_shard_size + new_padding_low_max); + } + + // Since the convolution RHS operand size increased with halos, adjust + // the window config accordingly.
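+ // As a check on the formulas above (assuming that + // MultiplyAddDivideOffsetCalculation(m, a, d) evaluates to (m * i + a) / d + // for shard ordinal i): when wd.window_dilation() == 1 the two terms of + // new_padding_low_function cancel to the constant + // (window_count - 1) * wd.stride(), so the pad/dynamic-slice path above is + // only taken for dilated windows.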
+ new_window.mutable_dimensions(i)->set_padding_low(new_padding_low); + new_window.mutable_dimensions(i)->set_padding_high(new_padding_high); + new_window.mutable_dimensions(i)->set_size( + rhs.hlo()->shape().dimensions(rhs_dimension) + halo_size); + } + + HloInstruction* conv_lhs = lhs.hlo(); + if (need_dynamic_slice_lhs) { + auto pad = b_.AddInstruction( + HloInstruction::CreatePad(pad_shape, lhs.hlo(), zero, pad_config)); + conv_lhs = b_.AddInstruction(HloInstruction::CreateDynamicSlice( + dynamic_slice_shape, pad, dynamic_slice_start_indices, + dynamic_slice_shape.dimensions())); + } + + // Exchange halo and concatenate. + HloInstruction* rhs_with_halo = rhs.hlo(); + for (int i = 0; i < dnums.kernel_spatial_dimensions_size(); ++i) { + int64 dim = dnums.kernel_spatial_dimensions(i); + int64 explicit_left_padding_on_full_shape = + left_halo_size_functions[dim].Calculate(0); + int64 shard_size_with_halo = new_window.dimensions(i).size(); + + // offset_on_padded_shape and padded_full_shape_size are needed only if + // we want to mask out-of-range values in ExchangeHaloAndGetValidData(). + // Since the default value for both the collective-permute is zero and + // also we call PadWithValue() on both operands at the beginning, we + // don't need to mask here. + // + // TODO(hyoulkee): Consider removing one of the two PadWithValue() calls + // if it's always safe. + auto offset_on_padded_shape = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + rhs_shard_sizes[i], explicit_left_padding_on_full_shape, 1)) - + left_halo_size_functions[dim]; + int64 padded_full_shape_size = + offset_on_padded_shape.Calculate(shard_counts[i] - 1) + + new_window.dimensions(i).size(); + auto concat = ExchangeHaloAndGetValidData( + rhs_with_halo, rhs.base_shape(), left_halo_size_functions[dim], + right_halo_size_functions[dim], explicit_left_padding_on_full_shape, + padded_full_shape_size, shard_size_with_halo, dim, rhs.sharding(), + offset_on_padded_shape.Calculate(partition_ordinals[dim], &b_), + zero, partition_ordinals[dim], collective_ops_creator_, + next_channel_id_, &b_, /*mask_invalid_region=*/false); + if (!concat) { + return DefaultAction(hlo); + } + rhs_with_halo = *concat; + } + + SetPartitionedHlo(hlo, [&]() { + auto conv = b_.AddInstruction(HloInstruction::CreateConvolve( + hlo->shape(), conv_lhs, rhs_with_halo, hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums, + hlo->precision_config())); + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, conv, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + } + + if (!sharding.IsTileMaximal()) { + // We don't currently support sharding on output feature dimension. + if (sharding.tile_assignment().dim(dnums.output_feature_dimension()) > 1) { + return DefaultAction(hlo); + } + + // Check if the operand and the output sharding are aligned. 
+ std::vector input_to_output_indices(hlo->shape().rank()); + input_to_output_indices[dnums.input_batch_dimension()] = + dnums.output_batch_dimension(); + input_to_output_indices[dnums.input_feature_dimension()] = + dnums.output_feature_dimension(); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + input_to_output_indices[dnums.input_spatial_dimensions(i)] = + dnums.output_spatial_dimensions(i); + } + auto target_operand_sharding = + hlo_sharding_util::TransposeSharding(sharding, input_to_output_indices); + lhs = lhs.Reshard(target_operand_sharding); + + // Replicate the RHS. + rhs = rhs.Reshard(HloSharding::Replicate()); + + // Convolution window config does not include batch and feature dimensions, + // whereas ReshardAsWindowedInput() expects the same number of window + // dimensions as the rank of the operand. So add two more trivial + // dimensions. + std::vector ones(hlo->shape().rank(), 1); + auto operand_window = window_util::MakeWindow(ones); + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *operand_window.mutable_dimensions(dnums.input_spatial_dimensions(i)) = + hlo->window().dimensions(i); + } + + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + auto resharded_operand_and_window = lhs.ReshardAsWindowedInput( + operand_window, target_operand_sharding, zero); + if (!resharded_operand_and_window.has_value()) { + return DefaultAction(hlo); + } + Window new_window; + for (int64 i = 0; i < dnums.input_spatial_dimensions_size(); ++i) { + *new_window.add_dimensions() = + resharded_operand_and_window->shard_window.dimensions( + dnums.input_spatial_dimensions(i)); + } + TF_ASSIGN_OR_RETURN( + Shape sharded_conv_shape, + ShapeInference::InferConvolveShape( + resharded_operand_and_window->sharded_input->shape(), + rhs.hlo()->shape(), hlo->feature_group_count(), + hlo->batch_group_count(), new_window, dnums)); + auto shard_shape = MakePartitionedShape(hlo->shape(), hlo->sharding()); + *sharded_conv_shape.mutable_layout() = shard_shape.layout(); + SetPartitionedHlo(hlo, [&]() { + auto sharded_conv = b_.AddInstruction(HloInstruction::CreateConvolve( + sharded_conv_shape, resharded_operand_and_window->sharded_input, + rhs.hlo(), hlo->feature_group_count(), hlo->batch_group_count(), + new_window, dnums, hlo->precision_config())); + if (!resharded_operand_and_window->dynamic_slice_index_on_output + .has_value()) { + CHECK(ShapeUtil::Compatible(shard_shape, sharded_conv->shape())); + return sharded_conv; + } + return b_.AddInstruction(HloInstruction::CreateDynamicSlice( + shard_shape, sharded_conv, + *resharded_operand_and_window->dynamic_slice_index_on_output, + shard_shape.dimensions())); + }); + return Status::OK(); + } + return DefaultAction(hlo); +} + +Status SpmdPartitioningVisitor::HandleDot(HloInstruction* hlo) { + DotGeneralDimsMapping mapping; + const auto& dnums = hlo->dot_dimension_numbers(); + int64 next_output_dim = 0; + for (int64 i = 0; i < dnums.lhs_batch_dimensions_size(); ++i) { + mapping.batch_dims.emplace_back(); + mapping.batch_dims.back().lhs = dnums.lhs_batch_dimensions(i); + mapping.batch_dims.back().rhs = dnums.rhs_batch_dimensions(i); + mapping.batch_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < dnums.lhs_contracting_dimensions_size(); ++i) { + mapping.contracting_dims.emplace_back(); + mapping.contracting_dims.back().lhs = dnums.lhs_contracting_dimensions(i); + mapping.contracting_dims.back().rhs = dnums.rhs_contracting_dimensions(i); + 
mapping.contracting_dims.back().output = -1; + } + for (int64 i = 0; i < hlo->operand(0)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.lhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.lhs_contracting_dimensions(), i)) { + continue; + } + mapping.lhs_non_contracting_dims.emplace_back(); + mapping.lhs_non_contracting_dims.back().lhs = i; + mapping.lhs_non_contracting_dims.back().rhs = -1; + mapping.lhs_non_contracting_dims.back().output = next_output_dim++; + } + for (int64 i = 0; i < hlo->operand(1)->shape().rank(); ++i) { + if (absl::c_linear_search(dnums.rhs_batch_dimensions(), i) || + absl::c_linear_search(dnums.rhs_contracting_dimensions(), i)) { + continue; + } + mapping.rhs_non_contracting_dims.emplace_back(); + mapping.rhs_non_contracting_dims.back().lhs = -1; + mapping.rhs_non_contracting_dims.back().rhs = i; + mapping.rhs_non_contracting_dims.back().output = next_output_dim++; + } + auto create_sharded_dot = [&](HloInstruction* l, HloInstruction* r, + SpmdBuilder* b) -> StatusOr { + TF_ASSIGN_OR_RETURN( + auto sharded_dot_shape, + ShapeInference::InferDotOpShape(l->shape(), r->shape(), + hlo->dot_dimension_numbers())); + return b->AddInstruction(HloInstruction::CreateDot( + sharded_dot_shape, l, r, hlo->dot_dimension_numbers(), + hlo->precision_config())); + }; + return HandleDotHelper(hlo, mapping, create_sharded_dot); +} + +Status SpmdPartitioningVisitor::HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot) { + const HloSharding& lhs_sharding = hlo->operand(0)->sharding(); + const HloSharding& rhs_sharding = hlo->operand(1)->sharding(); + + // Similar to hlo_sharding_util::TransposeSharding(), but allows + // removing/adding non-partitioned dimensions. 
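+ // For example, when mapping a dot operand's sharding to the output, the + // contracting dimensions map to -1 because they have no counterpart in the + // output; the mapping below succeeds only if those dimensions are + // unpartitioned, and returns absl::nullopt otherwise.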
+ auto transpose_sharding = + [&](const HloSharding& source, absl::Span src_to_tgt, + absl::Span tgt_to_src) -> absl::optional { + if (source.IsTileMaximal()) { + return source; + } + std::vector tgt_dims_skipping_new(tgt_to_src.size(), -1); + int64 skipped_tgt_dims = 0; + for (int64 i = 0; i < tgt_to_src.size(); ++i) { + if (tgt_to_src[i] < 0) { + skipped_tgt_dims++; + } else { + tgt_dims_skipping_new[i] = i - skipped_tgt_dims; + } + } + int64 skipped_src_dims = absl::c_count(src_to_tgt, -1); + std::vector perm(src_to_tgt.size()); + for (int64 i = 0; i < src_to_tgt.size(); ++i) { + if (src_to_tgt[i] < 0) { + if (source.tile_assignment().dim(i) > 1) { + return absl::nullopt; + } + perm[src_to_tgt.size() - skipped_src_dims] = i; + skipped_src_dims--; + } else { + perm[tgt_dims_skipping_new[src_to_tgt[i]]] = i; + } + } + auto tgt_sharding = hlo_sharding_util::TransposeSharding(source, perm); + if (skipped_tgt_dims == 0) { + return tgt_sharding; + } + auto reshape_tiles = tgt_sharding.tile_assignment(); + std::vector tgt_tiles(tgt_to_src.size(), 1); + for (int64 i = 0; i < tgt_tiles.size(); ++i) { + if (tgt_to_src[i] >= 0) { + tgt_tiles[i] = reshape_tiles.dim(tgt_dims_skipping_new[i]); + } + } + reshape_tiles.Reshape(tgt_tiles); + return HloSharding::Tile(reshape_tiles); + }; + + std::vector lhs_to_rhs_indices(hlo->operand(0)->shape().rank(), -1); + std::vector lhs_to_output_indices(hlo->operand(0)->shape().rank(), -1); + std::vector rhs_to_lhs_indices(hlo->operand(1)->shape().rank(), -1); + std::vector rhs_to_output_indices(hlo->operand(1)->shape().rank(), -1); + std::vector output_to_lhs_indices(hlo->shape().rank(), -1); + std::vector output_to_rhs_indices(hlo->shape().rank(), -1); + auto populate_indices_mapping = + [&](const DotGeneralDimsMapping::DimsMapping& mapping) { + if (mapping.lhs >= 0) { + lhs_to_rhs_indices[mapping.lhs] = mapping.rhs; + lhs_to_output_indices[mapping.lhs] = mapping.output; + } + if (mapping.rhs >= 0) { + rhs_to_lhs_indices[mapping.rhs] = mapping.lhs; + rhs_to_output_indices[mapping.rhs] = mapping.output; + } + if (mapping.output >= 0) { + output_to_lhs_indices[mapping.output] = mapping.lhs; + output_to_rhs_indices[mapping.output] = mapping.rhs; + } + }; + for (const auto& mapping : dims_mapping.batch_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.lhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + for (const auto& mapping : dims_mapping.rhs_non_contracting_dims) { + populate_indices_mapping(mapping); + } + auto lhs_sharding_transposed_to_match_rhs = + transpose_sharding(lhs_sharding, lhs_to_rhs_indices, rhs_to_lhs_indices); + auto rhs_sharding_transposed_to_match_lhs = + transpose_sharding(rhs_sharding, rhs_to_lhs_indices, lhs_to_rhs_indices); + auto lhs_sharding_transposed_to_match_output = transpose_sharding( + lhs_sharding, lhs_to_output_indices, output_to_lhs_indices); + auto rhs_sharding_transposed_to_match_output = transpose_sharding( + rhs_sharding, rhs_to_output_indices, output_to_rhs_indices); + auto output_sharding_transposed_to_match_lhs = transpose_sharding( + hlo->sharding(), output_to_lhs_indices, lhs_to_output_indices); + auto output_sharding_transposed_to_match_rhs = transpose_sharding( + hlo->sharding(), output_to_rhs_indices, rhs_to_output_indices); + + // lhs_rhs_or_output: 0 lhs, 1 rhs, 2 output. 
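+ // For example, an LHS sharding whose tile assignment has shape [2, 4, 1], + // with batch dimensions {0, 1}, yields 2 * 4 = 8 partitions from the helper + // below.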
+ auto get_partitions_for_dims = + [&](const HloSharding& sharding, + absl::Span dims, + int lhs_rhs_or_output) { + int64 partitions = 1; + if (sharding.IsTileMaximal()) { + return partitions; + } + for (const auto& dim : dims) { + if (lhs_rhs_or_output == 0) { + partitions *= sharding.tile_assignment().dim(dim.lhs); + } else if (lhs_rhs_or_output == 1) { + partitions *= sharding.tile_assignment().dim(dim.rhs); + } else { + CHECK_EQ(lhs_rhs_or_output, 2); + partitions *= sharding.tile_assignment().dim(dim.output); + } + } + return partitions; + }; + const int64 lhs_batch_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.batch_dims, 0); + const int64 rhs_batch_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.batch_dims, 1); + const int64 output_batch_partitions = + get_partitions_for_dims(hlo->sharding(), dims_mapping.batch_dims, 2); + const int64 lhs_contracting_partitions = + get_partitions_for_dims(lhs_sharding, dims_mapping.contracting_dims, 0); + const int64 rhs_contracting_partitions = + get_partitions_for_dims(rhs_sharding, dims_mapping.contracting_dims, 1); + const int64 lhs_non_contracting_partitions = get_partitions_for_dims( + lhs_sharding, dims_mapping.lhs_non_contracting_dims, 0); + const int64 rhs_non_contracting_partitions = get_partitions_for_dims( + rhs_sharding, dims_mapping.rhs_non_contracting_dims, 1); + const int64 output_lhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.lhs_non_contracting_dims, 2); + const int64 output_rhs_non_contracting_partitions = get_partitions_for_dims( + hlo->sharding(), dims_mapping.rhs_non_contracting_dims, 2); + + auto& lhs = GetPartitionedHlo(hlo->operand(0)); + auto& rhs = GetPartitionedHlo(hlo->operand(1)); + // LHS and RHS are partitioned the same way and only partitioned in batch + // dimensions. + if (lhs_batch_partitions == rhs_batch_partitions && + rhs_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_rhs == rhs_sharding) { + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + dot->set_sharding(*lhs_sharding_transposed_to_match_output); + return PartitionedHlo(dot, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // Try emit batch-partitioned einsum with one operand resharded. Returns + // whether the attempt succeeds. If may_reshard_with_allreduce is false, + // reshard must be done using all-to-all; otherwise this attempt fails. + auto try_emit_output_batch_partitioned_einsum_with_reshard = + [&](bool may_reshard_with_allreduce) -> StatusOr { + // LHS and output are batch partitioned in the same way. + if (lhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + lhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(rhs.sharding(), + *lhs_sharding_transposed_to_match_rhs)) { + return false; + } + auto resharded_rhs = rhs.Reshard(*lhs_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(lhs.hlo(), resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + // RHS and output are batch partitioned in the same way. 
+ if (rhs_batch_partitions == num_partitions_ && + output_batch_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + if (!may_reshard_with_allreduce && + !CanReshardWithAllToAll(lhs.sharding(), + *rhs_sharding_transposed_to_match_lhs)) { + return false; + } + auto resharded_lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs); + TF_ASSIGN_OR_RETURN( + auto dot, create_sharded_dot(resharded_lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return true; + } + return false; + }; + + { + // Try batch-parallel by resharding one operand, and not using all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(false)); + if (emitted) { + return Status::OK(); + } + } + + // Try to emit windowed DotGeneral when one operand is partitioned in the same + // way as the output along non-contracting dimensions, but the other operand + // is tiled in other dimensions. + auto emit_windowed_dot_general = [&](int64 matching_operand, + int64 windowing_operand, + bool windowed_at_contracting_dims, + bool windowed_at_batch_dims) { + CHECK_EQ(matching_operand + windowing_operand, 1); + CHECK(!windowed_at_batch_dims || !windowed_at_contracting_dims); + auto unpadded_result_buffer_shape = + MakePartitionedShape(hlo->shape(), hlo->sharding()); + auto padded_result_buffer_shape = unpadded_result_buffer_shape; + // For windowing at batch/non-contracting dims, we produce the result one + // partition at a time, so we need to pad the shape in case of uneven + // partitioning in order to make dynamic-update-slice in-bound. + if (!windowed_at_contracting_dims) { + padded_result_buffer_shape = GetPaddedShapeForUnevenPartitioning( + padded_result_buffer_shape, + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output); + } + // Mask the padding area of the windowed operand with zero if there is + // uneven partitioning. + if (windowed_at_contracting_dims) { + auto& to_mask = windowing_operand == 0 ? lhs : rhs; + to_mask = + to_mask.PadWithValue(b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type())))); + } + auto result_buffer = CreateZero(padded_result_buffer_shape, &b_); + auto iteration = b_.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))); + + // Create a while loop that computes one window per iteration. During each + // iteration, each partition sends its input window to its neighbor using + // collective-permute for the next iteration. 
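+ // The loop state is the tuple (lhs, rhs, result_buffer, iteration); the + // result buffer either accumulates partial dots (windowed at contracting + // dims) or is written one shard per iteration via dynamic-update-slice + // (windowed at batch/non-contracting dims).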
+ SpmdBuilder body_b("windowed_dot_general_body", visiting_hlo_); + auto param = body_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto l = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(lhs.hlo()->shape(), param, 0)); + auto r = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(rhs.hlo()->shape(), param, 1)); + auto o = body_b.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), param, 2)); + auto i = body_b.AddInstruction( + HloInstruction::CreateGetTupleElement(iteration->shape(), param, 3)); + + auto partition_id = collective_ops_creator_.create_partition_id(&body_b); + auto data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, partition_id)); + auto partition_count = body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))); + data_partition_id = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kRemainder, data_partition_id, partition_count)); + auto dot_lhs = l; + auto dot_rhs = r; + if (windowed_at_contracting_dims || windowed_at_batch_dims) { + // Slice the matching operand according to the partitioned contracting + // dimensions on the windowed operand. We do this by treating the matching + // operand as replicated, and resharding it to match the windowed operand. + auto slice_operand = matching_operand == 0 ? l : r; + slice_operand->set_sharding(HloSharding::Replicate()); + auto state = MakePartitioningState(); + state.b = &body_b; + state.partition_id = data_partition_id; + auto slice = PartitionedHlo(slice_operand, slice_operand->shape(), state) + .Reshard(windowing_operand == 0 + ? *lhs_sharding_transposed_to_match_rhs + : *rhs_sharding_transposed_to_match_lhs) + .hlo(); + slice_operand->clear_sharding(); + if (matching_operand == 0) { + dot_lhs = slice; + } else { + dot_rhs = slice; + } + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(dot_lhs, dot_rhs, &body_b)); + if (windowed_at_contracting_dims) { + // Accumulate the partial output to the result buffer. + o = body_b.AddInstruction( + HloInstruction::CreateBinary(o->shape(), HloOpcode::kAdd, o, dot)); + } else { + // The windowing operand is partitioned along batch/non-contracting + // dimensions, so we need a dynamic-update-slice to save the partial + // output in the result buffer. + auto offsets = MakePartitionOffsets( + o->shape(), + windowing_operand == 0 ? *lhs_sharding_transposed_to_match_output + : *rhs_sharding_transposed_to_match_output, + data_partition_id, &body_b); + o = body_b.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + o->shape(), o, dot, offsets)); + } + + // ++i + i = body_b.AddInstruction(HloInstruction::CreateBinary( + i->shape(), HloOpcode::kAdd, i, + body_b.AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(1))))); + auto has_more = body_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), i, + body_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))), + ComparisonDirection::kLt)); + // Collective-permute for the next window. We don't need it for the last + // iteration, so we use a conditional around the collective-permute. 
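+ // The conditional's true branch rotates the window to the neighbor + // partition via collective-permute; the false branch is a no-op computation + // that returns the operand unchanged, avoiding a transfer on the last + // iteration.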
+ HloInstruction* conditional; + { + SpmdBuilder cp_b("window_collective_permute", visiting_hlo_); + { + auto p = cp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + std::vector> sd_pairs(num_partitions_); + for (int64 source = 0; source < num_partitions_; ++source) { + // 0 -> n-1, 1 -> 0, 2 -> 1, ... + sd_pairs[source] = {source, + (source - 1 + num_partitions_) % num_partitions_}; + } + collective_ops_creator_.create_cross_partition_collective_permute( + &cp_b, p, sd_pairs, (*next_channel_id_)++); + } + SpmdBuilder ncp_b("last_iteration_noop", visiting_hlo_); + { + ncp_b.AddInstruction(HloInstruction::CreateParameter( + 0, windowing_operand == 0 ? l->shape() : r->shape(), "window")); + } + conditional = body_b.AddInstruction(HloInstruction::CreateConditional( + windowing_operand == 0 ? l->shape() : r->shape(), has_more, + windowing_operand == 0 ? l : r, + module_->AddEmbeddedComputation(cp_b.Build()), + windowing_operand == 0 ? l : r, + module_->AddEmbeddedComputation(ncp_b.Build()))); + } + if (windowing_operand == 0) { + l = conditional; + } else { + r = conditional; + } + body_b.AddInstruction(HloInstruction::CreateTuple({l, r, o, i})); + + SpmdBuilder cond_b("windowed_dot_general_cond", visiting_hlo_); + auto cond_param = cond_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, + ShapeUtil::MakeTupleShape({lhs.hlo()->shape(), rhs.hlo()->shape(), + result_buffer->shape(), iteration->shape()}), + "param")); + auto cond_i = cond_b.AddInstruction(HloInstruction::CreateGetTupleElement( + iteration->shape(), cond_param, 3)); + cond_b.AddInstruction(HloInstruction::CreateCompare( + ShapeUtil::MakeShape(PRED, {}), cond_i, + cond_b.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(num_partitions_))), + ComparisonDirection::kLt)); + auto while_loop = b_.AddInstruction(HloInstruction::CreateWhile( + cond_param->shape(), module_->AddEmbeddedComputation(cond_b.Build()), + module_->AddEmbeddedComputation(body_b.Build()), + b_.AddInstruction(HloInstruction::CreateTuple( + {lhs.hlo(), rhs.hlo(), result_buffer, iteration})))); + windowed_dot_general_loops_.push_back({while_loop, windowing_operand, + windowed_at_contracting_dims, + windowed_at_batch_dims}); + SetPartitionedHlo(hlo, [&] { + auto result = b_.AddInstruction(HloInstruction::CreateGetTupleElement( + result_buffer->shape(), while_loop, 2)); + if (!ShapeUtil::Compatible(padded_result_buffer_shape, + unpadded_result_buffer_shape)) { + result = b_.AddInstruction(HloInstruction::CreateSlice( + unpadded_result_buffer_shape, result, + std::vector(padded_result_buffer_shape.rank(), 0), + unpadded_result_buffer_shape.dimensions(), + std::vector(padded_result_buffer_shape.rank(), 1))); + } + return result; + }); + return Status::OK(); + }; + if (output_lhs_non_contracting_partitions == num_partitions_ && + output_sharding_transposed_to_match_lhs == lhs_sharding && + ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()) >= + options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (rhs_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, true, false); + } + if (rhs_non_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, false, false); + } + if (rhs_batch_partitions == num_partitions_) { + return emit_windowed_dot_general(0, 1, false, true); + } + } + if (output_rhs_non_contracting_partitions == num_partitions_ && + output_sharding_transposed_to_match_rhs == 
rhs_sharding && + ShapeUtil::ByteSizeOf(hlo->operand(0)->shape()) >= + options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { + if (lhs_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, true, false); + } + if (lhs_non_contracting_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, false); + } + if (lhs_batch_partitions == num_partitions_) { + return emit_windowed_dot_general(1, 0, false, true); + } + } + + { + // Try batch-parallel by resharding one operand, and allowing all-reduce. + TF_ASSIGN_OR_RETURN( + bool emitted, + try_emit_output_batch_partitioned_einsum_with_reshard(true)); + if (emitted) { + return Status::OK(); + } + } + + // LHS and RHS have the same partitioned contracting dimensions. + if (lhs_contracting_partitions == rhs_contracting_partitions && + lhs_contracting_partitions == num_partitions_) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + // Pad both sides with zero, since NaN at one side cannot be masked by zero + // on the other side. + if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < + ShapeUtil::ByteSizeOf(rhs.base_shape())) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()) + .Reshard(hlo->sharding()) + .hlo(); + }); + return Status::OK(); + } + + // LHS and output have the same partitioned non-contracting dimensions. + if (lhs_non_contracting_partitions == num_partitions_ && + output_lhs_non_contracting_partitions == num_partitions_ && + lhs_sharding == hlo->sharding()) { + auto rhs_replicated = rhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs_replicated, &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // RHS and output have the same partitioned non-contracting dimensions. + if (rhs_non_contracting_partitions == num_partitions_ && + output_rhs_non_contracting_partitions == num_partitions_ && + rhs_sharding_transposed_to_match_output == hlo->sharding()) { + auto lhs_replicated = lhs.Reshard(HloSharding::Replicate()).hlo(); + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs_replicated, rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Output is batch partitioned. + if (output_batch_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(resharded_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along LHS non-contracting dimensions. 
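+ // Reshard LHS to match the output sharding and replicate RHS; no all-reduce + // is needed here since the contracting dimensions are not partitioned.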
+ if (output_lhs_non_contracting_partitions == num_partitions_) { + auto resharded_lhs = lhs.Reshard(*output_sharding_transposed_to_match_lhs); + auto replicated_rhs = rhs.Reshard(HloSharding::Replicate()); + TF_ASSIGN_OR_RETURN( + auto dot, + create_sharded_dot(resharded_lhs.hlo(), replicated_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + // Output is partitioned along RHS non-contracting dimensions. + if (output_rhs_non_contracting_partitions == num_partitions_) { + auto replicated_lhs = lhs.Reshard(HloSharding::Replicate()); + auto resharded_rhs = rhs.Reshard(*output_sharding_transposed_to_match_rhs); + TF_ASSIGN_OR_RETURN(auto dot, create_sharded_dot(replicated_lhs.hlo(), + resharded_rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { return dot; }); + return Status::OK(); + } + + // Returns true if it is beneficial to reshard the operand at `operand_idx` + // across the contracting dimension. + const auto should_partition_contracting_dim = [&](int64 operand_idx) { + if (!hlo->sharding().IsReplicated()) { + return false; + } + + if (operand_idx == 0) { + // If LHS and output are replicated, we compare the cost of all-gather + // on RHS vs all-reduce on the output. + return (rhs_contracting_partitions == num_partitions_) && + lhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(1)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } else { + return (lhs_contracting_partitions == num_partitions_) && + rhs.sharding().IsReplicated() && + ShapeUtil::ElementsIn(hlo->operand(0)->shape()) > + ShapeUtil::ElementsIn(hlo->shape()); + } + }; + + // When the output is replicated and one of the operands is partitioned along + // a contracting dimension, align the other operand to be partitioned along + // the contracting dimensions. + if (hlo->sharding().IsReplicated() && (should_partition_contracting_dim(0) || + should_partition_contracting_dim(1))) { + auto zero = b_.AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + if (should_partition_contracting_dim(0)) { + lhs = + lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); + rhs = rhs.PadWithValue(zero); + } else { + lhs = lhs.PadWithValue(zero); + rhs = + rhs.Reshard(*lhs_sharding_transposed_to_match_rhs).PadWithValue(zero); + } + TF_ASSIGN_OR_RETURN(auto dot, + create_sharded_dot(lhs.hlo(), rhs.hlo(), &b_)); + SetPartitionedHlo(hlo, [&] { + auto ar = collective_ops_creator_.create_cross_partition_all_reduce( + &b_, dot, MakeBinaryAdd(hlo->shape().element_type(), module_), + NewChannel()); + ar->set_sharding(HloSharding::Replicate()); + return PartitionedHlo(ar, hlo->shape(), MakePartitioningState()).hlo(); + }); + return Status::OK(); + } + + return DefaultAction(hlo); +} + +namespace { + +// Finds a cluster of nodes that produce the inputs for `hlo` which only depend +// on small operands, which means the cluster should start with broadcasts, +// constants and iotas. All other internal nodes must be non-side-effecting +// elementwise ops. Returns the set of nodes, and the small operands. E.g., for +// the following graph, +// +// a -> broadcast -> multiply +// iota ---> add--/ +// constant/ +// +// FindInputNodesIfOnlyDependOnSmallOperands(multiply) will return +// <{broadcast, iota, constant, add, multiply}, [a]>.
+std::pair, std::vector> +FindInputNodesIfOnlyDependOnSmallOperands(HloInstruction* hlo) { + std::unordered_set nodes_found; + std::vector new_operands; + std::unordered_set new_operands_set; + std::vector worklist; + worklist.push_back(hlo); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (nodes_found.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast || + inst->opcode() == HloOpcode::kConstant || + inst->opcode() == HloOpcode::kIota) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + auto res = new_operands_set.emplace(o); + if (res.second) { + new_operands.push_back(o); + } + } + } else if (inst->IsElementwise() && !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + nodes_found.insert(inst); + for (auto o : inst->operands()) { + worklist.push_back(o); + } + } else { + nodes_found.clear(); + new_operands.clear(); + break; + } + } + return {std::move(nodes_found), std::move(new_operands)}; +} + +// Moves a cluster of memory-reducing nodes into the windowed dot-general loop +// on contracting dimensions. Such a loop has a dynamic slice on the +// non-windowed operand. If we move the input nodes into the loop, the +// dynamic-slice could be merged with them by later optimization passes, which +// reduces memory. +// +// small_operands small_operands +// | | +// input_nodes loop { | +// | => input_nodes +// loop { | | +// dynamic-slice dynamic-slice +// ... ... +// } } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes. +Status SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions( + HloInstruction* loop, int64 non_windowed_operand_index) { + auto input_tuple = loop->mutable_operand(0); + auto old_operand = input_tuple->mutable_operand(non_windowed_operand_index); + auto input_nodes = FindInputNodesIfOnlyDependOnSmallOperands(old_operand); + auto to_sink = std::move(input_nodes.first); + auto new_operands = std::move(input_nodes.second); + if (to_sink.empty()) { + return Status::OK(); + } + auto computation = loop->parent(); + // Replace the old operand with a tuple of the found small operands. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR(input_tuple->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_input_subtuple)); + + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto old_body_param_users = body_param->users(); + // Update all tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body->root_instruction()}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), + {non_windowed_operand_index}) = + new_input_subtuple->shape(); + } + // Now update the loop body. + auto new_operand_tuple_inside = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, non_windowed_operand_index)); + TF_RETURN_IF_ERROR(body->root_instruction()->ReplaceOperandWithDifferentShape( + non_windowed_operand_index, new_operand_tuple_inside)); + + // Create nodes inside the loop body. 
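+ // outside_to_inside maps each instruction outside the loop to its clone + // inside the body; a node enters the worklist only once all of its operands + // have been mapped, so clones are created in topological order.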
+ std::vector worklist; + std::unordered_map outside_to_inside; + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_sink.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_operand_tuple_inside, i)); + add_users_if_available(new_operands[i]); + } + // HLOs to sink without operands. + std::vector nullaries_to_sink; + for (auto inst : to_sink) { + if (inst->operand_count() == 0) { + nullaries_to_sink.push_back(inst); + } + } + // Sort nullaries_to_sink to make it deterministic. + absl::c_sort(nullaries_to_sink, + [](const HloInstruction* a, const HloInstruction* b) { + return a->unique_id() < b->unique_id(); + }); + for (auto inst : nullaries_to_sink) { + worklist.push_back(inst); + } + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + std::vector inst_new_operands(inst->operand_count()); + for (int64 i = 0; i < inst->operand_count(); ++i) { + inst_new_operands[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction( + inst->CloneWithNewOperands(inst->shape(), inst_new_operands)); + add_users_if_available(inst); + } + TF_RET_CHECK(outside_to_inside.count(old_operand) > 0); + for (auto ou : old_body_param_users) { + if (ou->opcode() == HloOpcode::kGetTupleElement && + ou->tuple_index() == non_windowed_operand_index) { + TF_RETURN_IF_ERROR( + ou->ReplaceAllUsesWith(outside_to_inside[old_operand])); + TF_RETURN_IF_ERROR(body->RemoveInstruction(ou)); + } + } + return Status::OK(); +} + +// Moves a cluster of memory-reducing nodes (with reduce nodes at the end) into +// the windowed dot-general loop on non-contracting dimensions. Such a loop has +// a dynamic-update-slice at the output. If we move the user nodes into the loop +// and before the dynamic-update-slice, the user nodes can operate on smaller +// shapes, which reduces memory. +// +// small_operands small_operands +// | | => | | +// | | loop { loop { | | +// | | conv | broadcast conv +// | | | | | / +// | | dynamic-update-slice | dynamic-slice / +// | | | | | / +// | | } | | multiply----- +// |broadcast / | / +// | | / reduce +// |multiply-- | +// \ | dynamic-update-slice +// reduce } +// +// Later optimization passes (TpuPadSliceMover) will merge the dynamic slice +// with the input nodes (broadcast). +Status MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions( + HloInstruction* loop) { + CHECK_EQ(loop->user_count(), 1); + // There should be a single direct user of the while loop, which is the + // gte for element 2, i.e., the dot output. + auto user_gte = loop->users().front(); + CHECK_EQ(user_gte->opcode(), HloOpcode::kGetTupleElement); + CHECK_EQ(user_gte->tuple_index(), 2); + auto computation = loop->parent(); + + // Find the reduce outputs and the input nodes they depend on, if input nodes + // only have small operands. 
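+ // to_move collects the nodes that will be moved into the loop body, and + // reduce_outputs records the reduce users whose accumulation happens inside + // the loop.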
+ std::unordered_set to_move; + std::vector new_operands; + std::unordered_set new_operands_set; + std::vector reduce_outputs; + std::vector worklist; + Shape padded_shape = user_gte->shape(); + Shape unpadded_shape = user_gte->shape(); + auto original_output = user_gte; + + if (user_gte->user_count() == 1 && + user_gte->users().back()->opcode() == HloOpcode::kSlice) { + original_output = user_gte->users().back(); + unpadded_shape = original_output->shape(); + } + for (auto u : original_output->users()) { + worklist.push_back(u); + } + to_move.insert(original_output); + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (to_move.count(inst) > 0) { + continue; + } + // We only support reduces with a simple reduction function, since we may need + // to accumulate across iterations manually. + if (inst->opcode() == HloOpcode::kReduce && + inst->to_apply()->instruction_count() == 3 && + inst->to_apply()->num_parameters() == 2 && + inst->to_apply()->root_instruction()->IsElementwise()) { + to_move.insert(inst); + auto other_operand = inst->mutable_operand(1); + auto res = new_operands_set.emplace(other_operand); + if (res.second) { + new_operands.push_back(other_operand); + } + reduce_outputs.push_back(inst); + } else if (inst != computation->root_instruction() && + inst->user_count() > 0 && inst->IsElementwise() && + !inst->HasSideEffectNoRecurse() && + inst->opcode() != HloOpcode::kAllReduce && + absl::c_all_of(inst->operands(), + [inst](const HloInstruction* o) { + return ShapeUtil::CompatibleIgnoringElementType( + o->shape(), inst->shape()); + })) { + // For an elementwise op, we need to make sure that it depends only on + // nodes already in to_move and nodes with small operands. + bool can_include = true; + for (auto operand : inst->operands()) { + if (to_move.count(operand) > 0) { + continue; + } + auto find_result = FindInputNodesIfOnlyDependOnSmallOperands(operand); + if (find_result.first.empty()) { + can_include = false; + break; + } + for (auto n : find_result.first) { + to_move.insert(n); + } + for (auto new_operand : find_result.second) { + auto res = new_operands_set.insert(new_operand); + if (res.second) { + new_operands.push_back(new_operand); + } + } + } + if (!can_include) { + to_move.clear(); + break; + } + to_move.insert(inst); + for (auto u : inst->users()) { + worklist.push_back(u); + } + } else { + to_move.clear(); + break; + } + } + // If nothing is found, to_move could contain only original_output, or could + // have been cleared by the above code. + if (to_move.size() <= 1) { + return Status::OK(); + } + + // We will replace the original loop output with reduce-shape outputs. Create + // the initial buffers before the loop. + for (auto out : reduce_outputs) { + auto padded_out_shape = out->shape(); + int64 operand_dim = 0; + int64 output_dim = 0; + while (output_dim < padded_out_shape.rank()) { + if (absl::c_linear_search(out->dimensions(), operand_dim)) { + // Dimension collapsed. + ++operand_dim; + continue; + } + // Kept dimensions have the same size as the padded shape.
+ padded_out_shape.set_dimensions(output_dim, + padded_shape.dimensions(operand_dim)); + ++operand_dim; + ++output_dim; + } + auto broadcast = + computation->AddInstruction(HloInstruction::CreateBroadcast( + padded_out_shape, + computation->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::Zero(out->shape().element_type()))), + {})); + new_operands.push_back(broadcast); + } + + auto input_tuple = loop->mutable_operand(0); + // Create the new input subtuple that contains the small operands and the + // reduce-shape result buffers. + auto new_input_subtuple = + computation->AddInstruction(HloInstruction::CreateTuple(new_operands)); + TF_RETURN_IF_ERROR( + input_tuple->ReplaceOperandWithDifferentShape(2, new_input_subtuple)); + auto body = loop->while_body(); + auto body_param = body->parameter_instruction(0); + auto body_root = body->root_instruction(); + CHECK_EQ(body_root->opcode(), HloOpcode::kTuple); + // Update tuple shapes. + for (auto tuple : std::vector{ + input_tuple, loop, loop->while_condition()->parameter_instruction(0), + body_param, body_root}) { + *ShapeUtil::GetMutableSubshape(tuple->mutable_shape(), {2}) = + new_input_subtuple->shape(); + } + auto new_loop_input = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_input_subtuple->shape(), body_param, 2)); + + // Now create the moved nodes inside the loop body. + std::unordered_map outside_to_inside; + worklist.clear(); + auto add_users_if_available = [&](HloInstruction* inst) { + for (auto u : inst->users()) { + if (outside_to_inside.count(u) == 0 && to_move.count(u) > 0 && + absl::c_all_of(u->operands(), [&](const HloInstruction* o) { + return outside_to_inside.count(o) > 0; + })) { + worklist.push_back(u); + } + } + }; + for (int64 i = 0; i < new_operands.size(); ++i) { + outside_to_inside[new_operands[i]] = + body->AddInstruction(HloInstruction::CreateGetTupleElement( + new_operands[i]->shape(), new_loop_input, i)); + add_users_if_available(new_operands[i]); + } + // The elementwise nodes will be created with sliced shape. The original loop + // output corresponds to the dynamic-update-slice's update slice. + auto dus = body_root->mutable_operand(2); + CHECK_EQ(dus->opcode(), HloOpcode::kDynamicUpdateSlice); + outside_to_inside[original_output] = dus->mutable_operand(1); + add_users_if_available(original_output); + std::vector slice_offsets(padded_shape.rank()); + for (int64 i = 0; i < slice_offsets.size(); ++i) { + slice_offsets[i] = dus->mutable_operand(i + 2); + } + auto get_slice = [&](HloInstruction* padded) { + return body->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + padded->shape().element_type()), + padded, slice_offsets, dus->operand(1)->shape().dimensions())); + }; + // Helper functions to create nodes with small operands. 
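+ // Each helper below clones the small-operand op at the padded full shape + // and then uses get_slice() to extract the current iteration's window, so + // the result matches the shape of the dynamic-update-slice's update operand.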
+ auto add_broadcast = [&](const HloInstruction* broadcast) { + auto padded_operand_shape = broadcast->operand(0)->shape(); + for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { + padded_operand_shape.set_dimensions( + i, padded_shape.dimensions(broadcast->dimensions(i))); + } + auto padded_operand = PadToShape(outside_to_inside[broadcast->operand(0)], + padded_operand_shape, nullptr, body); + outside_to_inside[broadcast] = + get_slice(body->AddInstruction(broadcast->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + padded_operand_shape.element_type()), + {padded_operand}))); + }; + auto add_iota = [&](const HloInstruction* iota) { + outside_to_inside[iota] = + get_slice(body->AddInstruction(iota->CloneWithNewOperands( + ShapeUtil::ChangeElementType(padded_shape, + iota->shape().element_type()), + {}))); + }; + auto add_constant = [&](const HloInstruction* constant) { + outside_to_inside[constant] = body->AddInstruction(constant->Clone()); + outside_to_inside[constant] = get_slice( + PadToShape(outside_to_inside[constant], + ShapeUtil::ChangeElementType( + padded_shape, constant->shape().element_type()), + nullptr, body)); + }; + while (!worklist.empty()) { + auto inst = worklist.back(); + worklist.pop_back(); + if (outside_to_inside.count(inst) > 0) { + continue; + } + if (inst->opcode() == HloOpcode::kBroadcast) { + add_broadcast(inst); + } else if (inst->opcode() == HloOpcode::kIota) { + add_iota(inst); + } else if (inst->opcode() == HloOpcode::kConstant) { + add_constant(inst); + } else if (inst->opcode() == HloOpcode::kReduce) { + // This is an output, which has special handling later. + } else { + std::vector operands_inside(inst->operand_count()); + for (int64 i = 0; i < operands_inside.size(); ++i) { + operands_inside[i] = outside_to_inside[inst->operand(i)]; + } + outside_to_inside[inst] = body->AddInstruction(inst->CloneWithNewOperands( + ShapeUtil::ChangeElementType(dus->operand(1)->shape(), + inst->shape().element_type()), + operands_inside)); + } + add_users_if_available(inst); + } + std::vector new_outputs_inside(new_operands.size()); + for (int64 i = 0; i < new_outputs_inside.size(); ++i) { + new_outputs_inside[i] = outside_to_inside[new_operands[i]]; + } + // Now create the reduce outputs inside the loop. + for (int64 i = 0; i < reduce_outputs.size(); ++i) { + auto reduce_outside = reduce_outputs[i]; + CHECK_EQ(reduce_outside->opcode(), HloOpcode::kReduce); + int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i; + auto last_iter_result = outside_to_inside[new_operands[index_in_operand]]; + auto operand0 = outside_to_inside[reduce_outside->operand(0)]; + auto operand1 = outside_to_inside[reduce_outside->operand(1)]; + TF_ASSIGN_OR_RETURN(auto reduce_shape, + ShapeInference::InferReduceShape( + {&operand0->shape(), &operand1->shape()}, + reduce_outside->dimensions(), + reduce_outside->to_apply()->ComputeProgramShape())); + *reduce_shape.mutable_layout() = reduce_outside->shape().layout(); + std::vector reduce_dus_offsets; + // If any collapsed dimension is windowed, we need to accumulate with the + // last iteration's result. If such a dimension has padding, we also need to + // mask off invalid data.
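+ // A collapsed (reduced) dimension is considered windowed when its size on + // the operand inside the loop differs from its size on the original operand + // outside, which is what the dimension-size comparison below detects.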
+    bool needs_accumulate = false;
+    std::vector<int64> dims_to_mask;
+    for (int64 i = 0; i < slice_offsets.size(); ++i) {
+      if (absl::c_linear_search(reduce_outside->dimensions(), i)) {
+        if (reduce_outside->operand(0)->shape().dimensions(i) !=
+            operand0->shape().dimensions(i)) {
+          needs_accumulate = true;
+          if (unpadded_shape.dimensions(i) != padded_shape.dimensions(i)) {
+            dims_to_mask.push_back(i);
+          }
+        }
+        continue;
+      }
+      reduce_dus_offsets.push_back(slice_offsets[i]);
+    }
+    // Mask off invalid data in collapsed dimensions.
+    for (int64 dim : dims_to_mask) {
+      auto iota = body->AddInstruction(HloInstruction::CreateIota(
+          ShapeUtil::ChangeElementType(operand0->shape(), S32), dim));
+      auto add = body->AddInstruction(HloInstruction::CreateBinary(
+          iota->shape(), HloOpcode::kAdd, iota,
+          body->AddInstruction(HloInstruction::CreateBroadcast(
+              iota->shape(), slice_offsets[dim], {}))));
+      auto limit = body->AddInstruction(HloInstruction::CreateBroadcast(
+          iota->shape(),
+          body->AddInstruction(
+              HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(
+                  reduce_outside->operand(0)->shape().dimensions(dim)))),
+          {}));
+      auto compare = body->AddInstruction(HloInstruction::CreateCompare(
+          ShapeUtil::ChangeElementType(iota->shape(), PRED), add, limit,
+          ComparisonDirection::kLt));
+      operand0 = body->AddInstruction(HloInstruction::CreateTernary(
+          operand0->shape(), HloOpcode::kSelect, compare, operand0,
+          body->AddInstruction(HloInstruction::CreateBroadcast(
+              operand0->shape(), operand1, {}))));
+    }
+    auto output_inside =
+        body->AddInstruction(reduce_outside->CloneWithNewOperands(
+            reduce_shape, {operand0, operand1}));
+    // Accumulate with previous results if needed.
+    if (needs_accumulate) {
+      auto input_slice =
+          body->AddInstruction(HloInstruction::CreateDynamicSlice(
+              output_inside->shape(), last_iter_result, reduce_dus_offsets,
+              output_inside->shape().dimensions()));
+      output_inside = body->AddInstruction(HloInstruction::CreateBinary(
+          output_inside->shape(),
+          reduce_outside->to_apply()->root_instruction()->opcode(),
+          output_inside, input_slice));
+    }
+    // Dynamic-update-slice if needed.
+    if (!ShapeUtil::Compatible(output_inside->shape(),
+                               last_iter_result->shape())) {
+      output_inside =
+          body->AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+              last_iter_result->shape(), last_iter_result, output_inside,
+              reduce_dus_offsets));
+    }
+    new_outputs_inside[index_in_operand] = output_inside;
+  }
+  // Body output.
+  auto new_output_inside =
+      body->AddInstruction(HloInstruction::CreateTuple(new_outputs_inside));
+  TF_RETURN_IF_ERROR(
+      body_root->ReplaceOperandWithDifferentShape(2, new_output_inside));
+  TF_RETURN_IF_ERROR(body->RemoveInstructionAndUnusedOperands(dus));
+  // Replace uses of the reduces outside the loop.
+  auto new_output_gte =
+      computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+          new_output_inside->shape(), loop, 2));
+  for (int64 i = 0; i < reduce_outputs.size(); ++i) {
+    int64 index_in_operand = new_operands.size() - reduce_outputs.size() + i;
+    auto new_output =
+        computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+            new_outputs_inside[index_in_operand]->shape(), new_output_gte,
+            index_in_operand));
+    if (!ShapeUtil::Compatible(new_output->shape(),
+                               reduce_outputs[i]->shape())) {
+      new_output = computation->AddInstruction(HloInstruction::CreateSlice(
+          reduce_outputs[i]->shape(), new_output,
+          std::vector<int64>(new_output->shape().rank(), 0),
+          reduce_outputs[i]->shape().dimensions(),
+          std::vector<int64>(new_output->shape().rank(), 1)));
+    }
+    TF_RETURN_IF_ERROR(reduce_outputs[i]->ReplaceAllUsesWith(new_output));
+    TF_RETURN_IF_ERROR(
+        computation->RemoveInstructionAndUnusedOperands(reduce_outputs[i]));
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
+Status SpmdPartitioningVisitor::DoCodeMotionForWindowedDotGeneralLoops(
+    HloComputation* computation) {
+  for (auto& loop : windowed_dot_general_loops_) {
+    if (loop.windowed_in_contracting_dims || loop.windowed_in_batch_dims) {
+      // We have a dynamic-slice for the non-windowed operand in
+      // batch/contracting-dim windowed dot-general. So moving the
+      // broadcast/iota/elementwise ops into the loop could help reduce memory
+      // via fusion.
+      TF_RETURN_IF_ERROR(
+          SinkInputNodesIntoWindowedDotGeneralLoopOnContractingDimensions(
+              loop.while_loop, 1 - loop.windowed_operand));
+    }
+    if (!loop.windowed_in_contracting_dims) {
+      // We have a dynamic-update-slice for the output in
+      // batch/non-contracting-dim windowed dot-general. So moving reduce ops
+      // into the loop could help reduce memory.
+      TF_RETURN_IF_ERROR(
+          MoveUsersIntoWindowedDotGeneralLoopOnNonContractingDimensions(
+              loop.while_loop));
+    }
+  }
+  return Status::OK();
+}
+
+StatusOr<bool> SpmdPartitioningVisitor::DoPartition(
+    HloComputation* computation, const HloSharding& root_sharding) {
+  VLOG(2) << "Partitioning computation " << computation->name() << " for "
+          << num_replicas_ << " replicas and " << num_partitions_
+          << " partitions";
+  TF_RETURN_IF_ERROR(computation->Accept(this));
+
+  HloModule* module = computation->parent();
+  auto new_root =
+      GetPartitionedHlo(computation->root_instruction()).Reshard(root_sharding);
+  auto new_computation =
+      module->AddEmbeddedComputation(b_.Build(new_root.hlo()));
+  TF_RETURN_IF_ERROR(DoCodeMotionForWindowedDotGeneralLoops(new_computation));
+
+  // Replace the original computation with the new SPMD computation.
+  std::unordered_map<HloComputation*, HloComputation*> replacement;
+  replacement[computation] = new_computation;
+  module->ReplaceComputations(replacement);
+  return changed_;
+}
+
+Status SpmdPartitioningVisitor::HandlePartitionId(HloInstruction* hlo) {
+  return Unimplemented(
+      "PartitionId instruction is not supported for SPMD partitioning since "
+      "the meaning is ambiguous -- whether the instruction is replicated or "
+      "the data is replicated, and if the latter which data is replicated.");
+}
+
+SpmdPartitioner::SpmdPartitioner(int64 num_partitions, int64 num_replicas,
+                                 SpmdPartitionerOptions options)
+    : SpmdPartitioner(
+          num_partitions, num_replicas, std::move(options),
+          SPMDCollectiveOpsCreator{
+              [](SpmdBuilder* b) {
+                return b->AddInstruction(HloInstruction::CreatePartitionId());
+              },
+              [num_replicas](SpmdBuilder* b, HloInstruction* operand,
+                             HloComputation* reduction, int64 channel_id) {
+                return b->AddInstruction(HloInstruction::CreateAllReduce(
+                    operand->shape(), {operand}, reduction,
+                    CreateReplicaGroups(num_replicas),
+                    /*constrain_layout=*/false, channel_id,
+                    /*use_global_device_ids=*/false));
+              },
+              [](SpmdBuilder* b, HloInstruction* operand,
+                 std::vector<std::pair<int64, int64>>& src_dst_pairs,
+                 int64 channel_id) {
+                return b->AddInstruction(
+                    HloInstruction::CreateCollectivePermute(
+                        operand->shape(), operand, src_dst_pairs, channel_id));
+              },
+              [](SpmdBuilder* b, absl::Span<HloInstruction* const> operands,
+                 const std::vector<ReplicaGroup>& replica_groups,
+                 int64 channel_id, absl::optional<int64> split_dimension) {
+                std::vector<Shape> shapes(operands.size(),
+                                          operands[0]->shape());
+                const Shape output_shape =
+                    (shapes.size() == 1) ? shapes[0]
+                                         : ShapeUtil::MakeTupleShape(shapes);
+                return b->AddInstruction(HloInstruction::CreateAllToAll(
+                    output_shape, operands, replica_groups,
+                    /*constrain_layout=*/false, channel_id, split_dimension));
+              },
+          }) {}
+
+StatusOr<bool> SpmdPartitioner::PartitionComputation(
+    HloComputation* computation, const HloSharding& root_sharding,
+    int64* next_channel_id, SpmdLogger* logger) {
+  auto visitor =
+      CreateVisitor(computation, num_partitions_, num_replicas_,
+                    collective_ops_creator_, next_channel_id, logger, options_);
+  return visitor->DoPartition(computation, root_sharding);
+}
+
+std::unique_ptr<SpmdPartitioningVisitor> SpmdPartitioner::CreateVisitor(
+    HloComputation* computation, int64 num_partitions, int64 num_replicas,
+    const SPMDCollectiveOpsCreator& collective_ops_creator,
+    int64* next_channel_id, SpmdLogger* logger,
+    SpmdPartitionerOptions options) {
+  return absl::make_unique<SpmdPartitioningVisitor>(
+      computation, num_partitions, num_replicas, collective_ops_creator,
+      next_channel_id, logger, std::move(options), this);
+}
+
+StatusOr<bool> SpmdPartitioner::Run(HloModule* module) {
+  TF_RETURN_IF_ERROR(PreprocessSharding(module));
+
+  XLA_VLOG_LINES(1, SpmdLogger::ReportBeforePartition(
+                        *module, options_.report_instruction_count));
+
+  // Add the parameters' and output's shardings to the module.
+  std::vector<HloSharding> entry_params_shardings;
+  for (int64 i = 0; i < module->entry_computation()->num_parameters(); ++i) {
+    auto param = module->entry_computation()->parameter_instruction(i);
+    CHECK(param->has_sharding()) << "Missing sharding in entry parameter " << i;
+    entry_params_shardings.push_back(param->sharding());
+  }
+  module->set_spmd_parameters_shardings(entry_params_shardings);
+  auto entry_root = module->entry_computation()->root_instruction();
+  CHECK(entry_root->has_sharding()) << "Missing sharding in entry root.";
+  module->set_spmd_output_sharding(entry_root->sharding());
+
+  FlattenCallGraph flatten;
+  TF_ASSIGN_OR_RETURN(auto changed, flatten.Run(module));
+
+  SpmdLogger logger(options_.report_instruction_count);
+  auto program_shape = module->entry_computation()->ComputeProgramShape();
+  int64 next_channel_id = hlo_query::NextChannelId(*module);
+  TF_ASSIGN_OR_RETURN(
+      bool partition_changed,
+      PartitionComputation(
+          module->entry_computation(),
+          module->entry_computation()->root_instruction()->sharding(),
+          &next_channel_id, &logger));
+  changed |= partition_changed;
+
+  // For the entry computation, make sure that the root instruction and the
+  // parameters preserve their signatures.
+  auto new_program_shape = module->entry_computation()->ComputeProgramShape();
+  if (!options_.allow_module_signature_change) {
+    TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()(
+        program_shape.result(), new_program_shape.result()))
+        << "Result shape changed for the entry computation";
+    TF_RET_CHECK(program_shape.parameters_size() ==
+                 new_program_shape.parameters_size())
+        << "Parameter count changed for the entry computation";
+    for (int64 i = 0; i < program_shape.parameters_size(); ++i) {
+      TF_RET_CHECK(Shape::Equal().MinorToMajorOnlyInLayout()(
+          program_shape.parameters(i), new_program_shape.parameters(i)))
+          << "Parameter shape changed for the entry computation";
+    }
+  } else {
+    const auto& old_entry_layout = module->entry_computation_layout();
+    // Shapes can change but the layout should still remain the same.
+    for (int64 i = 0; i < new_program_shape.parameters_size(); ++i) {
+      TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+          old_entry_layout.parameter_shape(i),
+          new_program_shape.mutable_parameters(i)));
+    }
+    TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes(
+        old_entry_layout.result_shape(), new_program_shape.mutable_result()));
+
+    HloModuleConfig config = module->config();
+    *config.mutable_entry_computation_layout() =
+        ComputationLayout(new_program_shape, /*ignore_layouts=*/false);
+    module->set_config(config);
+  }
+
+  XLA_VLOG_LINES(1, SpmdLogger::ReportAfterPartition(
+                        *module, options_.report_instruction_count));
+  XLA_VLOG_LINES(1, logger.MakeReport());
+
+  if (changed) {
+    HloPassPipeline pass("spmd-cleanup");
+    pass.AddPass<TupleSimplifier>();
+    pass.AddPass<HloDCE>();
+    pass.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
+    pass.AddPass<FlattenCallGraph>();
+    TF_RETURN_IF_ERROR(pass.Run(module).status());
+  }
+
+  TF_RETURN_IF_ERROR(ClearShardingAttributes(module));
+  return changed;
+}
+
+Status SpmdPartitioner::PreprocessSharding(HloModule* module) {
+  for (HloComputation* computation : module->computations()) {
+    for (HloInstruction* hlo : computation->instructions()) {
+      if (hlo->HasSideEffectNoRecurse() && hlo->opcode() != HloOpcode::kRng) {
+        TF_RET_CHECK(hlo->has_sharding())
+            << "Side-effect HLO must have sharding: " << hlo->ToString();
+        TF_RET_CHECK(!HasReplicatedSharding(hlo->sharding()) ||
+                     hlo->opcode() == HloOpcode::kInfeed)
+            << "Non-infeed side-effect HLO cannot have a replicated sharding:"
+            << hlo->ToString();
+      }
+
+      // For unassigned HLOs, annotate with replicated sharding.
+      //
+      // Among side-effecting ops, only Rng is allowed to omit the annotation.
+      // In that case, we currently force it to run on core 0, since we don't
+      // support partitioning or replicating the Rng op (the values depend on
+      // the seed provided to each device).
+      //
+      // TODO(hyouklee): Should we also convert single-device shardings
+      // (without side-effects) into replicated?
+      if (!hlo->has_sharding()) {
+        if (hlo->opcode() == HloOpcode::kRng) {
+          hlo->set_sharding(HloSharding::AssignDevice(0));
+        } else {
+          hlo->set_sharding(
+              HloSharding::Single(hlo->shape(), HloSharding::Replicate()));
+        }
+      } else if (!hlo->sharding().IsTileMaximal()) {
+        std::vector<int64> available(num_partitions_);
+        std::iota(available.begin(), available.end(), 0);
+        TF_RET_CHECK(num_partitions_ == hlo_sharding_util::DevicesForSharding(
+                                            hlo->sharding(), available)
+                                            .size())
+            << "num_partitions:" << num_partitions_ << "\n"
+            << "SPMD partitioner only supports tile sharding that includes "
+               "all partitions. If you didn't add this sharding annotation in "
+               "the model, please file a bug to XLA team.\n"
+            << hlo->ToString();
+      }
+    }
+  }
+
+  // Entry computation's parameter and root sharding must be either all
+  // replicated or all on a single device.
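+  // (For example, entry parameters and root annotated with
+  // sharding={replicated} or sharding={maximal device=0} pass the checks
+  // below; a tiled annotation such as sharding={devices=[2,1]0,1} on the
+  // entry signature is only accepted when allow_module_signature_change is
+  // set.)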
+ if (!options_.allow_module_signature_change) { + const HloComputation* entry = module->entry_computation(); + TF_RET_CHECK(entry->root_instruction()->has_sharding()); + const HloSharding& root_sharding = entry->root_instruction()->sharding(); + TF_RET_CHECK(root_sharding.IsReplicated() || + root_sharding.UniqueDevice().has_value()) + << "Unsupported entry root sharding: " << root_sharding.ToString(); + + for (const HloInstruction* param : entry->parameter_instructions()) { + TF_RET_CHECK(param->has_sharding()); + TF_RET_CHECK(param->sharding().IsReplicated() || + param->sharding().UniqueDevice().has_value()) + << "Unsupported entry parameter sharding:" + << param->sharding().ToString(); + } + } + + return Status::OK(); +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h new file mode 100644 index 00000000000..09d2c4af908 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -0,0 +1,435 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ + +#include +#include +#include + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" + +namespace xla { +namespace spmd { + +struct SpmdPartitionerOptions { + // Always exchange halo on LHS for all convolutions. If false, backprop filter + // convolution exchanges halo on RHS. + bool conv_halo_exchange_always_on_lhs = true; + + // The number of instructions to be reported for the highest memory profile + // instructions. + int64 report_instruction_count = 5; + + // The minimum size in MiB of an einsum operand to be considered using + // windowed implementation in an HLO loop. + int64 threshold_for_windowed_einsum_mib = 256; + + // Whether the entry computations' signature could change after partitioning. + bool allow_module_signature_change = false; +}; + +// Class to wrap the computation builder to capture information during SPMD +// transformation. 
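+// (Usage note: the partitioning visitor is expected to call
+// set_visiting_hlo() with the original instruction being partitioned before
+// adding new instructions, so that derived_instructions() can later map each
+// original HLO to the instructions created for it.)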
+class SpmdBuilder : public HloComputation::Builder {
+ public:
+  SpmdBuilder(const std::string& name, HloInstruction* hlo)
+      : HloComputation::Builder(name) {
+    visiting_hlo_ = hlo;
+  }
+  HloInstruction* AddInstruction(std::unique_ptr<HloInstruction> instruction);
+
+  const std::vector<HloInstruction*>& derived_instructions(
+      HloInstruction* hlo) {
+    return instructions_.at(hlo);
+  }
+
+  void set_visiting_hlo(HloInstruction* hlo) { visiting_hlo_ = hlo; }
+
+  HloInstruction* visiting_hlo() const { return visiting_hlo_; }
+
+ private:
+  // Currently visiting instruction.
+  HloInstruction* visiting_hlo_;
+
+  // Map from the currently visiting (old) instruction to new instructions
+  // created during SPMD partitioning.
+  HloInstructionMap<std::vector<HloInstruction*>> instructions_;
+};
+
+// A set of functions that create the cross-partition collective ops.
+struct SPMDCollectiveOpsCreator {
+  // Function used to create a partition ID HLO.
+  std::function<HloInstruction*(SpmdBuilder*)> create_partition_id;
+
+  // Function used to create a cross-partition all-reduce HLO.
+  std::function<HloInstruction*(SpmdBuilder*, HloInstruction* operand,
+                                HloComputation* reduction, int64 channel_id)>
+      create_cross_partition_all_reduce;
+
+  // Function used to create a cross-partition collective-permute HLO.
+  std::function<HloInstruction*(
+      SpmdBuilder*, HloInstruction* operand,
+      std::vector<std::pair<int64, int64>>& src_dst_pairs,
+      int64 next_channel_id)>
+      create_cross_partition_collective_permute;
+
+  // Function used to create a cross-partition all-to-all HLO.
+  std::function<HloInstruction*(
+      SpmdBuilder*, absl::Span<HloInstruction* const> operands,
+      const std::vector<ReplicaGroup>& replica_groups, int64 channel_id,
+      absl::optional<int64> split_dimension)>
+      create_cross_partition_all_to_all;
+};
+
+// Logger to report memory usage during SPMD partitioning.
+class SpmdLogger {
+ public:
+  explicit SpmdLogger(int64 report_instruction_count)
+      : report_instruction_count_(report_instruction_count) {}
+  static std::string ReportBeforePartition(const HloModule& module,
+                                           int64 report_instruction_count);
+  static std::string ReportAfterPartition(const HloModule& module,
+                                          int64 report_instruction_count);
+
+  // Registers the logging for the groups of instructions created to transform
+  // the given hlo.
+  void RegisterLogEntry(HloInstruction* hlo,
+                        const std::vector<HloInstruction*>& group);
+
+  std::string MakeReport();
+
+ private:
+  template <typename F>
+  static std::string ReportMemoryUsage(const HloModule& module,
+                                       const F& filter,
+                                       int64 report_instruction_count);
+
+  // A vector of logging messages (one for each original HLO instruction),
+  // where the first integer of the pair represents the size of the HBM used.
+  std::vector<std::pair<int64, std::string>> entries_;
+
+  int64 report_instruction_count_;
+};
+
+class SpmdPartitioningVisitor;
+
+class SpmdPartitioner : public HloModulePass {
+ public:
+  SpmdPartitioner(int64 num_partitions, int64 num_replicas,
+                  SpmdPartitionerOptions options);
+  SpmdPartitioner(int64 num_partitions, int64 num_replicas,
+                  SpmdPartitionerOptions options,
+                  SPMDCollectiveOpsCreator collective_ops_creator)
+      : num_partitions_(num_partitions),
+        num_replicas_(num_replicas),
+        options_(std::move(options)),
+        collective_ops_creator_(std::move(collective_ops_creator)) {}
+  absl::string_view name() const override { return "spmd-partitioning"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+  // Transforms the given computation with SPMD instructions, replacing it with
+  // a new computation.
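+  // (Typical driver flow, as implemented in Run(): the entry computation is
+  // partitioned against its root instruction's sharding with a shared
+  // next_channel_id counter, and the returned bool reports whether the
+  // module changed.)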
+ StatusOr PartitionComputation(HloComputation* computation, + const HloSharding& root_sharding, + int64* next_channel_id, + SpmdLogger* logger); + + protected: + virtual std::unique_ptr CreateVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options); + + private: + // Verify that the sharding of instructions in the module are valid, and also + // fill in missing sharding information. + Status PreprocessSharding(HloModule* module); + + const int64 num_partitions_; + const int64 num_replicas_; + + SpmdPartitionerOptions options_; + SPMDCollectiveOpsCreator collective_ops_creator_; +}; + +// Class describes partition state of the data represented by an HLO created +// during SPMD partitioning pass. +// +// Data on some devices may include padding region, if the base (full) shape +// could not be evenly partitioned. +class PartitionedHlo { + public: + // Return value for ReshardAsWindowedInput which describes the resharded HLO, + // the window for the user on the shard, and if necessary, the dynamic slice + // offsets to be applied to the output of the op being sharded. + struct WindowedInputShardReturnValue { + HloInstruction* sharded_input; + Window shard_window; + absl::optional> dynamic_slice_index_on_output; + }; + // A cache for resharding each partitioned HLO. + struct ReshardCache { + struct PerHloCache { + std::vector> reshard_cache; + std::vector< + std::tuple> + window_reshard_cache; + }; + std::unordered_map per_hlo_cache; + }; + struct PartitioningState { + SpmdBuilder* b; + HloModule* module; + int64 num_replicas; + HloInstruction* partition_id; + SPMDCollectiveOpsCreator collective_ops_creator; + int64* next_channel_id; + ReshardCache* reshard_cache; + }; + PartitionedHlo(HloInstruction* hlo, Shape base_shape, PartitioningState state) + : hlo_(hlo), base_shape_(base_shape), state_(std::move(state)) { + CHECK(hlo->has_sharding()) + << "PartitionedHlo is missing sharding:" << hlo->ToString(); + // If the tuple shape instruction does not have a tuple sharding, reassign + // to use the tuple sharding. Reshard() implementation assumes this. + if (hlo_->shape().IsTuple() && !hlo_->sharding().IsTuple()) { + hlo_->set_sharding( + hlo_->sharding().GetTupleSharding(hlo_->shape()).ValueOrDie()); + } + } + + // Reshards the current SPMD instruction to a new sharding. Could only modify + // the reshard cache. + PartitionedHlo Reshard(const HloSharding& target); + + // Pads the garbage area of the output with the provided value. + PartitionedHlo PadWithValue(HloInstruction* pad_value) const; + + // Returns the SPMD instruction. + HloInstruction* hlo() const { return hlo_; } + + // Returns the sharding of the SPMD instruction. + const HloSharding& sharding() const { return hlo_->sharding(); } + + // Original full shape of the data. + const Shape& base_shape() const { return base_shape_; } + + int64 NewChannel() const { return (*state_.next_channel_id)++; } + + // Reshards the HLO to a usable partitioned input for a windowed user. Could + // only modify the reshard cache. + absl::optional ReshardAsWindowedInput( + const Window& window, const HloSharding& target, + HloInstruction* pad_value, bool mask_invalid_region = true); + + private: + // Same as Reshard except that it does not explicitly modify the reshard + // cache, although it would indirectly modify by calling Replicate(). 
+ PartitionedHlo ReshardNoCache(const HloSharding& target); + + // Helper function to replicate the data on all devices. Could only modify + // the reshard cache. + PartitionedHlo Replicate(); + + // Helper function to broadcast data from a single device to all devices. + PartitionedHlo Broadcast() const; + + // Helper function to reshard the tensor using AllToAll (instead of the + // default of Replicate followed by Slice). + PartitionedHlo ReshardWithAllToAll(const HloSharding& target) const; + + // Helper function to reshard the tensor using CollectivePermute. + PartitionedHlo ReshardWithCollectivePermute(const HloSharding& target) const; + + // SPMD instruction. + HloInstruction* hlo_; + + // The original shape of the data before SPMD transformation is applied. + Shape base_shape_; + + PartitioningState state_; +}; + +struct DotGeneralDimsMapping { + // The dimension numbers for the operands and output corresponding to a + // logical dimension (e.g., batch, contracting, non-contracting). If an + // operand or the output doesn't have the logical dimension, it is set to + // -1. + struct DimsMapping { + int64 lhs; + int64 rhs; + int64 output; + }; + std::vector batch_dims; + std::vector contracting_dims; + std::vector lhs_non_contracting_dims; + std::vector rhs_non_contracting_dims; +}; + +class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { + public: + SpmdPartitioningVisitor( + HloComputation* computation, int64 num_partitions, int64 num_replicas, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdLogger* logger, + SpmdPartitionerOptions options, SpmdPartitioner* partitioner); + + Status DefaultAction(HloInstruction* hlo) override; + Status HandleAllReduce(HloInstruction* hlo) override; + Status HandleBroadcast(HloInstruction* hlo) override; + Status HandleConstant(HloInstruction* hlo) override; + Status HandleCustomCall(HloInstruction* hlo) override; + Status HandleDot(HloInstruction* hlo) override; + Status HandleDynamicSlice(HloInstruction* hlo) override; + Status HandleDynamicUpdateSlice(HloInstruction* hlo) override; + Status HandleGather(HloInstruction* hlo) override; + Status HandleGetTupleElement(HloInstruction* hlo) override; + Status HandleInfeed(HloInstruction* hlo) override; + Status HandleOutfeed(HloInstruction* hlo) override; + Status HandlePad(HloInstruction* hlo) override; + Status HandleParameter(HloInstruction* hlo) override; + Status HandleReduce(HloInstruction* hlo) override; + Status HandleReverse(HloInstruction* hlo) override; + Status HandleWhile(HloInstruction* hlo) override; + Status HandleConditional(HloInstruction* hlo) override; + Status HandleReduceWindow(HloInstruction* hlo) override; + Status HandleSelectAndScatter(HloInstruction* hlo) override; + Status HandleTuple(HloInstruction* hlo) override; + Status HandleRng(HloInstruction* hlo) override; + Status HandleConvolution(HloInstruction* hlo) override; + Status HandleConcatenate(HloInstruction* hlo) override; + Status HandleScatter(HloInstruction* hlo) override; + Status HandleSlice(HloInstruction* hlo) override; + Status HandleSort(HloInstruction* hlo) override; + Status HandleTranspose(HloInstruction* hlo) override; + Status HandleReshape(HloInstruction* hlo) override; + Status HandleIota(HloInstruction* hlo) override; + Status HandlePartitionId(HloInstruction* hlo) override; + + // Handles convolution where both LHS and RHS operands are tiled. 
+ Status HandleConvolutionTiledLhsAndRhs(HloInstruction* hlo); + + // Implementation of dot partitioning given DotGeneralDimsMapping. + Status HandleDotHelper( + HloInstruction* hlo, const DotGeneralDimsMapping& dims_mapping, + const std::function( + HloInstruction*, HloInstruction*, SpmdBuilder*)>& create_sharded_dot); + + // Common handle for elementwise HLOs. + Status HandleElementwise(HloInstruction* hlo); + + // Common handle for HLOs that runs on a single device. + Status HandleSingleDevice(const HloInstruction* hlo); + + // Returns the PartitionedHlo that corresponds to the original hlo. + PartitionedHlo& GetPartitionedHlo(const HloInstruction* hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 1); + return partitioned_instructions_.find(hlo)->second; + } + + // Sets the PartitionedHlo for the original hlo. + void SetPartitionedHlo(const HloInstruction* hlo, + const PartitionedHlo& partitioned_hlo) { + CHECK_EQ(partitioned_instructions_.count(hlo), 0); + partitioned_instructions_.emplace(hlo, partitioned_hlo); + changed_ = true; + } + + // Convenient wrapper that creates PartitionedHlo from the result of the func + // and maps it to the given original hlo. + void SetPartitionedHlo(const HloInstruction* hlo, + const std::function& func) { + HloInstruction* new_hlo = func(); + new_hlo->set_sharding(hlo->sharding()); + new_hlo->set_metadata(hlo->metadata()); + SetPartitionedHlo( + hlo, PartitionedHlo(new_hlo, hlo->shape(), MakePartitioningState())); + changed_ = true; + } + + int64 NewChannel() { return (*next_channel_id_)++; } + + PartitionedHlo::PartitioningState MakePartitioningState() { + return PartitionedHlo::PartitioningState{ + .b = &b_, + .module = module_, + .num_replicas = num_replicas_, + .partition_id = partition_id_, + .collective_ops_creator = collective_ops_creator_, + .next_channel_id = next_channel_id_, + .reshard_cache = &reshard_cache_}; + } + + SpmdBuilder* builder() { return &b_; } + + StatusOr DoPartition(HloComputation* computation, + const HloSharding& root_sharding); + + private: + Status Preprocess(HloInstruction* hlo) override; + Status Postprocess(HloInstruction* hlo) override; + + // Performs code motion for windowed dot-general loops in + // windowed_dot_general_loops_. Invoked after the visitor finishes traversing + // the graph. + Status DoCodeMotionForWindowedDotGeneralLoops(HloComputation* computation); + + bool changed_; + HloModule* module_; + int64 num_partitions_; + int64 num_replicas_; + + SPMDCollectiveOpsCreator collective_ops_creator_; + + // Tracks the next channel id to use for cross-partition all-reduce. + int64* next_channel_id_; + SpmdBuilder b_; + + HloInstruction* partition_id_; + + PartitionedHlo::ReshardCache reshard_cache_; + + // Mapping from the instruction in the original computation to the new SPMD + // partitioned instruction. + ConstHloInstructionMap partitioned_instructions_; + + // Information about a loop created for windowed dot-general. Used when + // DoCodeMotionForWindowedDotGeneralLoops() executes after the visitor + // finishes traversing the graph. 
+ struct WindowedDotGeneralLoop { + HloInstruction* while_loop; + int64 windowed_operand; + bool windowed_in_contracting_dims; + bool windowed_in_batch_dims; + }; + std::vector windowed_dot_general_loops_; + + HloInstruction* visiting_hlo_; + SpmdLogger* logger_; + const SpmdPartitionerOptions options_; + SpmdPartitioner* partitioner_; +}; + +} // namespace spmd +} // namespace xla +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_H_ diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc new file mode 100644 index 00000000000..7a7f2dcc807 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -0,0 +1,3191 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace spmd { +namespace { + +using ::testing::AllOf; +namespace op = xla::testing::opcode_matchers; + +class SpmdPartitioningTest : public HloTestBase { + public: + StatusOr> PartitionComputation( + const char* hlo_module, int64 num_devices, + bool conv_halo_exchange_always_on_lhs = true) { + // Some tests (BackpropFilter convs) set this flag false to test two + // different paths of the implementation. 
+ SpmdPartitionerOptions options; + options.conv_halo_exchange_always_on_lhs = conv_halo_exchange_always_on_lhs; + options.allow_module_signature_change = true; + + TF_ASSIGN_OR_RETURN(auto module, ParseAndReturnVerifiedModule( + hlo_module, GetModuleConfigForTest())); + HloPassPipeline pass("spmd-partitioning"); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + pass.AddPass(num_devices, /*num_replicas=*/1, options); + pass.AddPass(/*layout_sensitive=*/false, + /*allow_mixed_precision=*/false); + TF_RETURN_IF_ERROR(pass.Run(module.get()).status()); + return StatusOr>(std::move(module)); + } +}; + +TEST_F(SpmdPartitioningTest, InvalidSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + auto module_status = PartitionComputation(hlo_string, /*num_devices=*/4); + EXPECT_FALSE(module_status.status().ok()); + EXPECT_THAT(module_status.status().ToString(), + ::testing::HasSubstr( + "only supports tile sharding that includes all partitions")); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Copy(op::AllReduce( + op::Select(op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]"))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={maximal device=1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + VLOG(1) << module->ToString(); + EXPECT_THAT(root, op::Copy(AllOf(op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare()), + op::Constant(), op::Broadcast()))), + op::Shape("s32[2,3]")))); +} + +TEST_F(SpmdPartitioningTest, SingleDeviceToTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={maximal device=0} + ROOT %copy = s32[2,3]{1,0} copy(%constant), + sharding={devices=[2,1]1,0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Copy(op::DynamicSlice( + op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Constant(), op::Broadcast())), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant())), + op::Shape("s32[1,3]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicated) { + const char* const hlo_string = R"( +HloModule module + 
+ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]"))))); +} + +TEST_F(SpmdPartitioningTest, TiledToSingleDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %constant = s32[2,3]{1,0} constant({{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT %copy = s32[2,3]{1,0} copy(%constant), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::Copy(op::Copy(op::AllReduce(AllOf( + op::DynamicUpdateSlice( + op::Broadcast(), AllOf(op::Constant(), op::Shape("s32[1,3]")), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant()), + op::Shape("s32[2,3]")))))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledEven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= s32[8,2]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT %copy = s32[8,2]{1,0} copy(%param), sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Reshape(op::Transpose(op::AllToAll(AllOf( + op::Reshape(op::Parameter()), op::Shape("s32[4,2,1]")))))), + op::Shape("s32[8,1]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param= f32[7,31,128]{2,1,0} parameter(0), sharding={devices=[1,2,1]0,1} + ROOT %copy = f32[7,31,128]{2,1,0} copy(%param), sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Copy(op::Slice(op::Reshape(AllOf(op::Transpose(op::AllToAll( + op::Reshape(AllOf(op::Pad(), op::Shape("f32[8,16,128]"))))))))))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementSwapDevice) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param.0 = (f32[2,3]{1,0}, u32[]) parameter(0), + sharding={{maximal device=1}, {maximal device=1}} + %gte.0 = f32[2,3]{1,0} get-tuple-element(%param.0), index=0, + sharding={maximal device=0} + %gte.1 = u32[] get-tuple-element(%param.0), index=1, + sharding={maximal device=0} + ROOT %tuple = (f32[2,3]{1,0}, u32[]) tuple(%gte.0, %gte.1), + sharding={{maximal device=0},{maximal device=0}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + EXPECT_THAT(root->operand(0), + op::Copy(op::AllReduce(op::Select( + 
op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); + EXPECT_THAT(root->operand(1), + op::Copy(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::GetTupleElement(op::Parameter()), op::Broadcast())))); +} + +TEST_F(SpmdPartitioningTest, GetTupleElementTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param.0 = (f32[2,3]{1,0}, u32[2,3]{1,0}) parameter(0), + sharding={{replicated}, {replicated}} + gte.0 = f32[2,3]{1,0} get-tuple-element(param.0), index=0, + sharding={devices=[2,1]0,1} + gte.1 = u32[2,3]{1,0} get-tuple-element(param.0), index=1, + sharding={devices=[2,1]0,1} + ROOT %tuple = (f32[2,3]{1,0}, u32[2,3]{1,0}) tuple(gte.0, gte.1), + sharding={{devices=[2,1]0,1},{devices=[2,1]0,1}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + ASSERT_THAT(root, op::Tuple()); + + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + + EXPECT_THAT(root->operand(0), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); + EXPECT_THAT(root->operand(1), + op::DynamicSlice(op::GetTupleElement(op::Parameter()), offset, + op::Constant())); +} + +TEST_F(SpmdPartitioningTest, TiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[8,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[8,2]{1,0} get-tuple-element(infeed), index=0, + sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, op::Copy(op::AllReduce(op::DynamicUpdateSlice( + op::Broadcast(), + op::GetTupleElement( + AllOf(op::Infeed(), op::Shape("(f32[4,2]{1,0}, token[])"))), + op::Reshape(op::DynamicSlice(op::Constant(), op::PartitionId(), + op::Constant())), + op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, UnevenTiledInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = (f32[9,2]{1,0}, token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {maximal device=0}} + ROOT infeed.data = f32[9,2]{1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[5,2]"), op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), + op::AfterAll(), op::AfterAll())))); + EXPECT_THAT( + root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), op::Infeed(op::Parameter()))); + auto second_infeed = + AllOf(op::Shape("(f32[4,2], token[])"), op::Infeed(op::Parameter())); + EXPECT_THAT(root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("(f32[5,2], token[])"), + op::Tuple(op::Pad(op::GetTupleElement(second_infeed), + op::Constant()), + op::GetTupleElement(second_infeed)))); +} + 
+TEST_F(SpmdPartitioningTest, UnevenTiledTupleInfeed) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token0 = token[] after-all(), sharding={maximal device=0} + infeed = ((f32[9,2]{1,0}, f32[2]{0}), token[]) infeed(token0), + sharding={{devices=[2,1]0,1}, {replicated}, {maximal device=0}} + ROOT infeed.data = (f32[9,2]{1,0}, f32[2]{0}) get-tuple-element(infeed), + index=0, sharding={{devices=[2,1]0,1}, {replicated}} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("(f32[5,2], f32[2])"), + op::GetTupleElement(op::Conditional( + op::Convert(op::PartitionId()), op::AfterAll(), + op::AfterAll())))); + EXPECT_THAT(root->operand(0)->called_computations()[0]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Infeed(op::Parameter()))); + auto second_infeed = AllOf(op::Shape("((f32[4,2], f32[2]), token[])"), + op::Infeed(op::Parameter())); + EXPECT_THAT( + root->operand(0)->called_computations()[1]->root_instruction(), + AllOf(op::Shape("((f32[5,2], f32[2]), token[])"), + op::Tuple(op::Tuple(op::Pad(op::GetTupleElement( + op::GetTupleElement(second_infeed)), + op::Constant()), + op::GetTupleElement( + op::GetTupleElement(second_infeed))), + op::GetTupleElement(second_infeed)))); +} + +TEST_F(SpmdPartitioningTest, TiledToReplicatedReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce = f32[] reduce(constant, constant.1), dimensions={0,1}, + to_apply=sum, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + op::AllReduce(op::Reduce( + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Reshape())), + op::Broadcast(op::Constant())), + AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant())), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledElementwise) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[3,3]{1,0} constant({{2,2,2},{2,2,2},{2,2,2}}), + sharding={replicated} + multiply = f32[3,3]{1,0} multiply(constant, constant.1), + sharding={devices=[2,1]0,1} + ROOT add = f32[3,3]{1,0} add(multiply, constant.1), + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf( + op::Shape("f32[2,3]{1,0}"), + op::Add(op::Multiply( + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant()), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())), + op::DynamicSlice(op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + 
+TEST_F(SpmdPartitioningTest, TiledAllReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + parameter = f32[3,3]{1,0} parameter(0), sharding={devices=[2,1]0,1} + ROOT all-reduce = f32[3,3]{1,0} all-reduce(parameter), to_apply=sum, + replica_groups={}, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, AllOf(op::Shape("f32[2,3]{1,0}"), op::AllReduce(op::Parameter(0)))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[3,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,1,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,4,3]{2,1,0}"), + op::Broadcast(op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, BroadcastOnlyOldDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, BroadcastBothOldAndNewDimsSharded) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1},{1,1,1}}), + sharding={replicated} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[2,2,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2,3]{2,1,0}"), + op::Broadcast(AllOf(op::Shape("f32[2,3]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), + op::Constant()))))); +} + +TEST_F(SpmdPartitioningTest, BroadcastPropagateTiledSharding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[4,3]{1,0} constant({{1,1,1},{1,4,1},{1,3,1},{1,2,1}}), + sharding={devices=[2,1]0,1} + ROOT broadcast = f32[4,4,3]{2,1,0} broadcast(constant), dimensions={1,2}, + sharding={devices=[1,2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2,3]{2,1,0}"), + op::Broadcast(op::DynamicSlice( + op::Constant(), op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, OutfeedSingleDevice) { + 
const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + token.0 = token[] after-all() + data = f32[1024]{0} parameter(0), sharding={maximal device=0} + outfeed = token[] outfeed(data, token.0), sharding={maximal device=0} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("token[]"), + op::Conditional( + op::Compare(op::PartitionId(), op::Constant()), + op::Tuple(op::Parameter(0), op::AfterAll()), + op::Tuple(op::Parameter(0), op::AfterAll())))); + + HloInstruction* root_b0 = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(root_b0, + AllOf(op::Shape("token[]"), + op::Outfeed(op::GetTupleElement(op::Parameter(), 0), + op::GetTupleElement(op::Parameter(), 1)))); + + HloInstruction* root_b1 = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(root_b1, AllOf(op::Shape("token[]"), op::AfterAll())); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowReplicatedInput) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={replicated} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[3,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_0x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT( + root, + AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow( + op::DynamicSlice(AllOf(op::Shape("f32[9,2]{1,0}"), + op::Pad(op::Constant(), op::Constant())), + op::Multiply(op::Reshape(), op::Constant()), + op::Constant()), + op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledNegativeLeftHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[6,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1},{1,2},{2,2}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT %reduce-window = f32[3,2]{1,0} reduce-window(%constant, %constant.1), + window={size=3x1 stride=2x1 pad=0_1x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = + op::Select(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, 
op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideUnequalHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[9,2]{1,0} constant( + {{1,1},{1,4},{2,1},{3,1},{1,2},{2,2},{4,1},{1,2},{2,1}}), + sharding={devices=[3,1]0,1,2} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[5,2]{1,0} reduce-window(constant, constant.1), + window={size=3x1 stride=2x1 pad=1_1x0_0}, to_apply=sum, + sharding={devices=[3,1]0,1,2} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/3)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto right_halo = AllOf(op::Shape("f32[2,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = op::DynamicSlice( + AllOf( + op::Shape("f32[7,2]{1,0}"), + op::Pad(op::Concatenate(sharded_input, right_halo), op::Constant())), + op::Reshape(), op::Constant()); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiledTwoSideHalo) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + constant = f32[4,2]{1,0} constant({{1,1},{1,4},{2,1},{3,1}}), + sharding={devices=[2,1]0,1} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2]{1,0} reduce-window(constant, constant.1), + window={size=5x1 stride=3x1 pad=2_2x0_0}, to_apply=sum, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant()); + auto left_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto right_halo = AllOf(op::Shape("f32[1,2]{1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto pre_masking = AllOf( + op::Shape("f32[5,2]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[6,2]{1,0}"), + op::Pad(op::Concatenate(left_halo, sharded_input, right_halo), + op::Constant())), + op::Reshape(), op::Constant())); + auto index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto masked = op::Select( + op::And(op::Compare(index_in_padded, op::Broadcast(op::Constant())), + op::Compare(index_in_padded, op::Broadcast(op::Constant()))), + pre_masking, op::Broadcast(op::Constant())); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ReduceWindowTiled2D) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + token0 = 
token[] after-all(), sharding={maximal device=0} + infeed = (f32[4,4,2,2]{3,2,1,0}, token[]) infeed(token0), + sharding={{devices=[2,2,1,1]0,1,2,3}, {maximal device=0}} + infeed.data = f32[4,4,2,2]{3,2,1,0} get-tuple-element(infeed), index=0, + sharding={devices=[2,2,1,1]0,1,2,3} + constant = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[2,2,2,2]{3,2,1,0} reduce-window(infeed.data, constant), + window={size=5x5x1x1 stride=3x3x1x1 pad=2_2x2_2x0_0x0_0}, to_apply=sum, + sharding={devices=[2,2,1,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + + auto sharded_input = AllOf(op::Shape("f32[2,2,2,2]{3,2,1,0}"), + op::GetTupleElement(op::Infeed())); + auto dim0_left_halo = AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_right_halo = AllOf(op::Shape("f32[1,2,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(sharded_input))); + auto dim0_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[6,2,2,2]{3,2,1,0}"), + op::Pad( + op::Concatenate(dim0_left_halo, sharded_input, dim0_right_halo), + op::Constant())), + op::Reshape(), op::Constant(), op::Constant(), op::Constant()); + auto dim0_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim0_masked = op::Select( + op::And(op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim0_index_in_padded, op::Broadcast(op::Constant()))), + dim0_pre_masking, op::Broadcast(op::Constant())); + auto dim0_resharded = AllOf(op::Shape("f32[5,2,2,2]{3,2,1,0}"), dim0_masked); + auto dim1_left_halo = AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_right_halo = + AllOf(op::Shape("f32[5,1,2,2]{3,2,1,0}"), + op::CollectivePermute(op::Slice(dim0_resharded))); + auto dim1_pre_masking = op::DynamicSlice( + AllOf(op::Shape("f32[5,6,2,2]{3,2,1,0}"), + op::Pad(op::Concatenate(dim1_left_halo, dim0_resharded, + dim1_right_halo), + op::Constant())), + op::Constant(), op::Reshape(), op::Constant(), op::Constant()); + auto dim1_index_in_padded = op::Add( + op::Iota(), op::Broadcast(op::Multiply(op::Reshape(), op::Constant()))); + auto dim1_masked = op::Select( + op::And(op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant())), + op::Compare(dim1_index_in_padded, op::Broadcast(op::Constant()))), + dim1_pre_masking, op::Broadcast(op::Constant())); + auto dim1_resharded = AllOf(op::Shape("f32[5,5,2,2]{3,2,1,0}"), dim1_masked); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,1,2,2]{3,2,1,0}"), + op::ReduceWindow(dim1_resharded, op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = 
module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedNeedReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(f32[128,224,224,3] %lhs), + sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(f32[7,7,3,64] %rhs), + sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution( + f32[128,224,224,3] %lhs.copy, + f32[7,7,3,64] %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,224,224,3]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[64,2,112,224,3]")); + auto reshard_lhs = AllOf(op::Reshape(op::Transpose(all_to_all)), + op::Shape("f32[128,112,224,3]")); + + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(reshard_lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT( + root, + AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, reshard_lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsReplicatedReordered) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[224,224,3,128] parameter(0) + %lhs.copy = f32[224,224,3,128] copy(%lhs), sharding={devices=[2,1,1,1]0,1} + %rhs = f32[7,7,3,64] parameter(1) + %rhs.copy = f32[7,7,3,64] copy(%rhs), sharding={replicated} + ROOT %conv = f32[128,112,112,64] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 stride=2x2 pad=3_3x3_3}, + dim_labels=01fb_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[112,224,3,128]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[7,7,3,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[3,224,3,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[2,224,3,128]")); + EXPECT_THAT(root, 
+ AllOf(op::Convolution( + op::Select(op::And(), + op::Concatenate(left_halo, lhs, right_halo), + op::Broadcast()), + rhs), + op::Shape("f32[128,56,112,64]"))); +} + +// (stride * per_shard_window_count) % dilation == 0 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationSameStartPatternLhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,4,4,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 stride=4x4 pad=1_1x1_1 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + // There is no halo exchange, and because the last element in the shard is not + // needed (stride == 4), the LHS will be just a slice. + auto sliced_lhs = + AllOf(op::Slice(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant()))), + op::Shape("f32[128,3,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + EXPECT_THAT(root, AllOf(op::Convolution(sliced_lhs, rhs), + op::Shape("f32[128,2,4,512]"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 1); +} + +// (stride * per_shard_window_count) % dilation != 0 but stride == 1 +TEST_F(SpmdPartitioningTest, + ConvolutionBaseDilationStride1LhsTiledRhsReplicated) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,7,7,512] parameter(0) + %lhs.copy = f32[128,7,7,512] copy(%lhs), + sharding={devices=[1,2,1,1]0,1} + %rhs = f32[3,3,512,512] parameter(1) + %rhs.copy = f32[3,3,512,512] copy(%rhs), + sharding={replicated} + ROOT %conv = f32[128,14,14,512] convolution(%lhs.copy, %rhs.copy), + window={size=3x3 pad=1_2x1_2 lhs_dilate=2x2 rhs_reversal=1x1}, + dim_labels=b01f_01io->b01f, + sharding={devices=[1,2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Shape("f32[128,4,7,512]")); + auto rhs = AllOf(op::Copy(op::Parameter()), op::Shape("f32[3,3,512,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,7,512]")); + auto start_window = op::Multiply(op::Reshape(), op::Constant()); + auto start_input_element = op::Divide(start_window, op::Constant()); + auto dynamic_offset_for_padded_concat = op::Subtract( + op::Constant(), op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + start_input_element)); + auto pre_masking = + AllOf(op::Shape("f32[128,5,7,512]"), + op::DynamicSlice( + AllOf(op::Shape("f32[128,6,7,512]"), + op::Pad(op::Concatenate(left_halo, lhs), op::Constant())), + op::Constant(), dynamic_offset_for_padded_concat, + op::Constant(), op::Constant())); + auto masked = op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(start_input_element)), + 
op::Broadcast(op::Constant())), + pre_masking, op::Broadcast(op::Constant())); + auto dynamic_offset_on_output = op::Subtract( + start_window, op::Multiply(start_input_element, op::Constant())); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(AllOf(op::Convolution(masked, rhs), + op::Shape("f32[128,8,14,512]")), + op::Constant(), dynamic_offset_on_output, + op::Constant(), op::Constant()), + op::Shape("f32[128,7,14,512]"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 1); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant())), + op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterNoOverlapReshard) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[1,4]0,1,2,3} + constant = f32[4,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=3x2 pad=0_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto source = + AllOf(op::Shape("f32[1,2]{1,0}"), + op::DynamicSlice(op::Constant(), op::Reshape(), op::Constant())); + auto operand = AllOf(op::Copy(op::DynamicSlice( + op::Parameter(0), op::Constant(), op::Reshape())), + op::Shape("f32[11,1]")); + auto 
reshard_operand = op::Reshape(op::Transpose( + op::AllToAll(op::Reshape(op::Pad(operand, op::Constant()))))); + auto masked_data = AllOf( + op::Shape("f32[3,4]{1,0}"), + op::Select( + op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply( + op::Reshape(), op::Constant()))), + op::Broadcast(op::Constant())), + reshard_operand, op::Broadcast(op::Constant()))); + + EXPECT_THAT(root, + AllOf(op::SelectAndScatter(masked_data, source, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatterWithOverlap) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param = f32[11,4]{1,0} parameter(0) + %param.copy = f32[11,4] copy(%param), + sharding={devices=[4,1]0,1,2,3} + constant = f32[6,2]{1,0} constant({{1,2},{3,4},{1,0},{2,8},{6,6},{1,9}}), + sharding={devices=[4,1]0,1,2,3} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[11,4]{1,0} select-and-scatter(param.copy, + constant, constant.1), window={size=3x2 stride=2x2 pad=1_1x0_0}, + select=ge, scatter=sum, sharding={devices=[4,1]0,1,2,3} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + + auto source_shard = + AllOf(op::Shape("f32[2,2]{1,0}"), + op::DynamicSlice(op::Pad(), op::Reshape(), op::Constant())); + // Max halo size is the same as the shard size, so slice is not needed. 
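+  // (Each partition holds a 2-row shard of the 6-row source; with window size
+  // 3 and stride 2 the last partition can need both rows of its left
+  // neighbor's shard, so the halo below is the whole shard rather than a
+  // slice of it.)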
+ auto source_left_halo = op::CollectivePermute(source_shard); + auto required_source_shard_start = + op::Divide(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto source_with_halo = op::DynamicSlice( + AllOf(op::Shape("f32[5,2]{1,0}"), + op::Pad(op::Concatenate(source_left_halo, source_shard), + op::Constant())), + op::Subtract(op::Constant(), + op::Subtract(op::Multiply(op::Reshape(), op::Constant()), + required_source_shard_start)), + op::Constant()); + auto masked_source_with_halo = AllOf( + AllOf(op::Shape("f32[3,2]{1,0}")), + op::Select( + op::Compare( + op::Add(op::Iota(), op::Broadcast(required_source_shard_start)), + op::Broadcast(op::Constant())), + source_with_halo, op::Broadcast(op::Constant()))); + + auto data_shard = + AllOf(op::Shape("f32[3,4]{1,0}"), + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Reshape(), op::Constant()))); + auto data_left_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto data_right_halo = AllOf(op::Shape("f32[2,4]{1,0}"), + op::CollectivePermute(op::Slice(data_shard))); + auto required_data_start_on_padded = + op::Multiply(required_source_shard_start, op::Constant()); + auto left_halo_size = op::Subtract( + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()), + required_data_start_on_padded); + auto data_with_halo = + AllOf(op::Shape("f32[7,4]{1,0}"), + op::DynamicSlice( + AllOf(op::Shape("f32[8,4]{1,0}"), + op::Pad(op::Concatenate(data_left_halo, data_shard, + data_right_halo), + op::Constant())), + op::Subtract(op::Constant(), left_halo_size), op::Constant())); + auto index_on_padded = + op::Add(op::Iota(), op::Broadcast(required_data_start_on_padded)); + auto masked_data_with_halo = op::Select( + op::And(op::Compare(index_on_padded, op::Broadcast(op::Constant())), + op::Compare(index_on_padded, op::Broadcast(op::Constant()))), + data_with_halo, op::Broadcast(op::Constant())); + + EXPECT_THAT( + root, AllOf(op::DynamicSlice(op::SelectAndScatter(masked_data_with_halo, + masked_source_with_halo, + op::Constant()), + left_halo_size, op::Constant()), + op::Shape("f32[3,4]{1,0}"))); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->operand(0)->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + %rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, DotLhsTiledRhsTiledWithReshard) { + const char* const 
hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,64] parameter(0) + %lhs.copy = f32[128,56,56,64] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,56,56,256] parameter(1) + %rhs.copy = f32[128,56,56,256] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,64,256] convolution(%lhs.copy, %rhs.copy), + window={size=56x56}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,56,56,256]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(lhs)), op::Shape("f32[2,64,28,56,64]")); + auto reshard = AllOf(op::Reshape(op::Transpose(all_to_all))); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(reshard, rhs)), + op::Shape("f32[1,1,64,256]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithReshard) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,512] parameter(0) + %lhs.copy = f32[128,56,56,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,64] parameter(1) + %rhs.copy = f32[128,28,28,64] copy(%rhs), sharding={devices=[2,1,1,1]0,1} + ROOT %conv = f32[1,1,512,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, + dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,512]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Constant(), + op::Constant(), op::Constant())), + op::Shape("f32[64,28,28,64]")); + auto all_to_all = + AllOf(op::AllToAll(op::Reshape(rhs)), op::Shape("f32[64,2,14,28,64]")); + auto reshard = op::Reshape(op::Transpose(all_to_all)); + + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), reshard)), + op::Shape("f32[1,1,512,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + 
op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[32,1,28,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[32,16,28,64]")))), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilate) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,2,112,64]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + lhs, AllOf(op::Concatenate(left_halo, rhs, right_halo), + op::Shape("f32[128,60,112,64]")))), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWindowDilateUneven) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] 
copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(rhs)), + op::Shape("f32[128,1,7,512]")); + EXPECT_THAT(root, + AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice(op::Pad(lhs, op::Constant()), + op::Constant(), op::Subtract(), + op::Constant(), op::Constant()), + op::Shape("f32[128,10,14,512]")), + AllOf(op::Concatenate(left_halo, rhs), + op::Shape("f32[128,5,7,512]")))), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConvolutionLhsTiledRhsTiledWithPadding_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,28,28,128] parameter(0) + %lhs.copy = f32[32,28,28,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,28,28,64] parameter(1) + %rhs.copy = f32[32,28,28,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,128,64] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=1_1x1_1}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,128]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,14,28,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[32,1,28,128]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[32,16,28,128]")), + rhs)), + op::Shape("f32[3,3,128,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilate_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,224,224,3] parameter(0) + %lhs.copy = f32[128,224,224,3] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,112,112,64] parameter(1) + %rhs.copy = f32[128,112,112,64] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[7,7,3,64] convolution(%lhs.copy, %rhs.copy), + window={size=112x112 pad=3_2x3_2 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) 
<< module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,112,224,3]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,56,112,64]")); + + auto left_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,3,224,3]")); + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,2,224,3]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::Concatenate(left_halo, lhs, right_halo), + op::Shape("f32[128,117,224,3]")), + rhs)), + op::Shape("f32[7,7,3,64]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateNegativeRhsPadding_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,56,56,256] parameter(0) + %lhs.copy = f32[128,56,56,256] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,28,28,512] parameter(1) + %rhs.copy = f32[128,28,28,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[1,1,256,512] convolution(%lhs.copy, %rhs.copy), + window={size=28x28 pad=0_-1x0_-1 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,28,56,256]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,14,28,512]")); + + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(op::Slice(lhs), rhs)), + op::Shape("f32[1,1,256,512]"))); +} + +TEST_F(SpmdPartitioningTest, + ConvolutionLhsTiledRhsTiledWindowDilateUneven_HaloOnLhs) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,14,14,512] parameter(0) + %lhs.copy = f32[128,14,14,512] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[128,7,7,512] parameter(1) + %rhs.copy = f32[128,7,7,512] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %conv = f32[3,3,512,512] convolution(%lhs.copy, %rhs.copy), + window={size=7x7 pad=1_0x1_0 rhs_dilate=2x2}, dim_labels=f01b_i01o->01bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[128,7,14,512]")); + auto rhs = AllOf( + op::Select(op::Compare(), + op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(), op::Constant()), op::Constant(), + op::Reshape(), op::Constant(), op::Constant())), + op::Broadcast()), + op::Shape("f32[128,4,7,512]")); + + auto right_halo = AllOf(op::CollectivePermute(op::Slice(lhs)), + op::Shape("f32[128,1,14,512]")); + EXPECT_THAT( + root, AllOf(op::AllReduce(op::Convolution( + AllOf(op::DynamicSlice( + AllOf(op::Pad(op::Concatenate(lhs, right_halo), + op::Constant()), + op::Shape("f32[128,10,14,512]")), + op::Constant(), op::Reshape(), 
op::Constant(), + op::Constant()), + op::Shape("f32[128,9,14,512]")), + rhs)), + op::Shape("f32[3,3,512,512]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[2,1]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[2,1]0,1} + ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,257]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[7,116]")); + EXPECT_THAT(root, + AllOf(op::Concatenate(param0, param1), op::Shape("f32[7,373]"))); +} + +TEST_F(SpmdPartitioningTest, ConcatenateAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[14,257] parameter(0) + %param0.copy = f32[14,257] copy(%param0), sharding={devices=[1,2]0,1} + %param1 = f32[14,116] parameter(1) + %param1.copy = f32[14,116] copy(%param1), sharding={devices=[1,2]0,1} + ROOT %concatenate = f32[14,373] concatenate(%param0.copy, %param1.copy), + dimensions={1}, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Reshape())), + op::Shape("f32[14,129]")); + auto param1 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[14,58]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::DynamicUpdateSlice( + op::DynamicUpdateSlice( + op::Broadcast(), param0, + op::Constant(), op::Multiply()), + param1, op::Constant(), op::Add())), + op::Shape("f32[14,374]")), + op::Constant(), op::Multiply()), + op::Shape("f32[14,187]"))); +} + +TEST_F(SpmdPartitioningTest, PadAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + %const = f32[] constant(0) + ROOT %pad = f32[128,17,257] pad(%param0.copy, %const), padding=0_0x1_2x0_0, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Pad(param0, op::Constant()), + op::Shape("f32[128,17,129]"))); +} + +TEST_F(SpmdPartitioningTest, SliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), 
sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[128,11,257] slice(%param0.copy), + slice={[0:128:1], [2:13:1], [0:257:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT(root, AllOf(op::Slice(param0), op::Shape("f32[128,11,129]"))); +} + +TEST_F(SpmdPartitioningTest, SliceAlongPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,1,2]0,1} + ROOT %slice = f32[63,14,251] slice(%param0.copy), + slice={[2:128:2], [0:14:1], [5:256:1]}, sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Pad(op::Parameter(), op::Constant()), + op::Constant(), op::Constant(), op::Reshape())), + op::Shape("f32[128,14,129]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf( + op::DynamicSlice( + AllOf(op::Concatenate( + param0, + AllOf(op::CollectivePermute(op::Slice(param0)), + op::Shape("f32[128,14,2]"))), + op::Shape("f32[128,14,131]")), + op::Constant(), op::Constant(), op::Add()), + op::Shape("f32[128,14,126]"))), + op::Shape("f32[63,14,126]"))); +} + +TEST_F(SpmdPartitioningTest, SortAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ge { + p.0.lhs.1247 = f32[]{:T(256)} parameter(0), sharding={replicated} + bitcast-convert = s32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + constant = s32[]{:T(256)} constant(0), sharding={replicated} + compare = pred[]{:T(256)E(32)} compare(bitcast-convert, constant), direction=LT, sharding={replicated} + constant.1 = u32[]{:T(256)} constant(2147483647), sharding={replicated} + bitcast-convert.1 = u32[]{:T(256)} bitcast-convert(p.0.lhs.1247), sharding={replicated} + subtract = u32[]{:T(256)} subtract(constant.1, bitcast-convert.1), sharding={replicated} + bitcast-convert.2 = s32[]{:T(256)} bitcast-convert(subtract), sharding={replicated} + select = s32[]{:T(256)} select(compare, bitcast-convert.2, bitcast-convert), sharding={replicated} + p.0.rhs.1248 = f32[]{:T(256)} parameter(1), sharding={replicated} + bitcast-convert.3 = s32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + compare.1 = pred[]{:T(256)E(32)} compare(bitcast-convert.3, constant), direction=LT, sharding={replicated} + bitcast-convert.4 = u32[]{:T(256)} bitcast-convert(p.0.rhs.1248), sharding={replicated} + subtract.1 = u32[]{:T(256)} subtract(constant.1, bitcast-convert.4), sharding={replicated} + bitcast-convert.5 = s32[]{:T(256)} bitcast-convert(subtract.1), sharding={replicated} + select.1 = s32[]{:T(256)} select(compare.1, bitcast-convert.5, bitcast-convert.3), sharding={replicated} + compare.2 = pred[]{:T(256)E(32)} compare(select, select.1), direction=GT, sharding={replicated} + compare.258 = pred[]{:T(256)E(32)} compare(select.1, select), direction=GT, sharding={replicated} + compare.259 = pred[]{:T(256)E(32)} compare(compare.2, compare.258), direction=EQ, sharding={replicated} + 
p.1.lhs.1249 = s32[]{:T(256)} parameter(2), sharding={replicated} + p.1.rhs.1250 = s32[]{:T(256)} parameter(3), sharding={replicated} + compare.260 = pred[]{:T(256)E(32)} compare(p.1.lhs.1249, p.1.rhs.1250), direction=LT, sharding={replicated} + ROOT select.86 = pred[]{:T(256)E(32)} select(compare.259, compare.260, compare.2), sharding={replicated} +} + +ENTRY entry { + %param0 = f32[128,14,257] parameter(0) + %param0.copy = f32[128,14,257] copy(%param0), sharding={devices=[1,2,1]0,1} + %param1 = s32[128,14,257] parameter(1) + %param1.copy = s32[128,14,257] copy(%param1), sharding={devices=[1,2,1]0,1} + ROOT %sort.6 = (f32[128,14,257]{2,1,0:T(8,128)}, s32[128,14,257]{2,1,0:T(8,128)}) + sort(%param0.copy, %param1.copy), dimensions={2}, is_stable=true, + to_apply=%ge, sharding={{devices=[1,2,1]0,1},{devices=[1,2,1]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[128,7,257]")); + auto param1 = + AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("s32[128,7,257]")); + EXPECT_THAT(root, AllOf(op::Sort(param0, param1), + op::Shape("(f32[128,7,257], s32[128,7,257])"))); +} + +TEST_F(SpmdPartitioningTest, PartitionCustomCall) { + const char* const hlo_string = R"( +HloModule cluster_2013453984438090939__.47 + +ENTRY %cluster_2013453984438090939__.47 + (arg_tuple.1: ()) -> (bf16[2,2000], s32[2,2000]) { + %arg_tuple.1 = bf16[2,209664] parameter(0) + %copy.arg_tuple.1 = bf16[2,209664] copy(%arg_tuple.1), sharding={devices=[1,2]0,1} + %custom-call = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + custom-call(bf16[2,209664]{1,0} %copy.arg_tuple.1), custom_call_target="TopK" + %get-tuple-element = bf16[2,2000]{1,0} + get-tuple-element((bf16[2,2000]{1,0}, s32[2,2000]{1,0}) %custom-call), + index=0, sharding={replicated} + %get-tuple-element.1 = s32[2,2000]{1,0} get-tuple-element((bf16[2,2000]{1,0}, + s32[2,2000]{1,0}) %custom-call), index=1, sharding={replicated} + ROOT %tuple.46 = (bf16[2,2000]{1,0}, s32[2,2000]{1,0}) + tuple(bf16[2,2000]{1,0} %get-tuple-element, s32[2,2000]{1,0} + %get-tuple-element.1), sharding={{replicated}, {replicated}}, + metadata={op_name="XLA_Retvals"} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto custom_call = FindInstruction(module.get(), "custom-call.1"); + EXPECT_EQ(custom_call->operand(0)->shape().dimensions(1), 104832); + auto sort = FindInstruction(module.get(), "sort"); + EXPECT_EQ(sort->operand(0)->shape().dimensions(1), 4000); + EXPECT_EQ(sort->operand(1)->shape().dimensions(1), 4000); +} + +TEST_F(SpmdPartitioningTest, ShardableTranspose) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %param0 = f32[16,38,38,4] parameter(0) + %param0.copy = f32[16,38,38,4] copy(%param0), sharding={devices=[1,2,1,1]0,1} + ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy), + dimensions={0,3,1,2}, sharding={devices=[1,1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), 
op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[16,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[16,4,19,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, MultiDimensionShardedTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0),
+    sharding={devices=[4,2,1,1]0,1,2,3,4,5,6,7}
+  ROOT %transpose = f32[38,4,16,38] transpose(%param0.copy),
+    dimensions={1,3,0,2}, sharding={devices=[2,1,4,1]0,2,4,6,1,3,5,7}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/8));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 = AllOf(
+      op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), op::Reshape(),
+                                op::Constant(), op::Constant())),
+      op::Shape("f32[4,19,38,4]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(param0), op::Shape("f32[19,4,4,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableTranspose) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[16,38,38,4] parameter(0)
+  %param0.copy = f32[16,38,38,4] copy(%param0), sharding={devices=[1,2,1,1]0,1}
+  ROOT %transpose = f32[16,4,38,38] transpose(%param0.copy),
+    dimensions={0,3,1,2}, sharding={devices=[1,2,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto reshard = AllOf(op::Reshape(op::Transpose(op::Reshape(op::AllToAll()))),
+                       op::Shape("f32[16,38,38,2]"));
+  EXPECT_THAT(root, AllOf(op::Transpose(), op::Shape("f32[16,2,38,38]")));
+}
+
+TEST_F(SpmdPartitioningTest, ShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[2,1,1]0,1}
+  ROOT %reshape = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[2,1,1,1]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  auto param0 =
+      AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(),
+                                      op::Constant(), op::Constant())),
+            op::Shape("f32[19,38,324]"));
+  EXPECT_THAT(root, AllOf(op::Reshape(param0), op::Shape("f32[19,38,4,81]")));
+}
+
+TEST_F(SpmdPartitioningTest, NonShardableReshape) {
+  const char* const hlo_string = R"(
+HloModule module
+
+ENTRY entry {
+  %param0 = f32[38,38,324] parameter(0)
+  %param0.copy = f32[38,38,324] copy(%param0), sharding={devices=[1,1,2]0,1}
+  ROOT %transpose = f32[38,38,4,81] reshape(%param0.copy),
+    sharding={devices=[1,1,1,2]0,1}
+})";
+
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          PartitionComputation(hlo_string, /*num_devices=*/2));
+  VLOG(1) << module->ToString();
+
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(
+      root,
+      AllOf(op::DynamicSlice(
+                AllOf(op::Pad(
+                          AllOf(op::Reshape(AllOf(op::AllReduce(),
+                                                  op::Shape("f32[38,38,324]"))),
+                                op::Shape("f32[38,38,4,81]")),
+                          op::Constant()),
+                      op::Shape("f32[38,38,4,82]")),
+                op::Constant(), op::Constant(), op::Constant(), op::Reshape()),
+            op::Shape("f32[38,38,4,41]")));
+}
+
+// Produces an invalid module after transformation.
+TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[128,5,5,768] parameter(0) + %param0.copy = f32[128,5,5,768] copy(%param0), + sharding={devices=[1,4,1,1]0,1,2,3} + %constant.1 = f32[] constant(0), sharding={replicated} + ROOT %rw = f32[128,17,17,768] reduce-window(%param0.copy, %constant.1), + window={size=1x5x5x1 pad=0_0x4_4x4_4x0_0 lhs_dilate=1x3x3x1}, + to_apply=sum, sharding={devices=[1,4,1,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto input_shard = op::Copy(op::DynamicSlice( + op::Pad(op::Parameter(0), op::Constant()), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())); + auto id_mul4_add1 = + op::Add(op::Multiply(op::Reshape(), op::Constant()), op::Constant()); + auto id_mul5 = op::Multiply(op::Reshape(), op::Constant()); + auto id_mul5_add1_div3 = + op::Divide(op::Add(id_mul5, op::Constant()), op::Constant()); + auto before_masking = AllOf( + op::Shape("f32[128,3,5,768]"), + op::DynamicSlice( + AllOf( + op::Shape("f32[128,4,5,768]"), + op::Concatenate(op::CollectivePermute(input_shard), input_shard)), + op::Constant(), + op::Subtract(op::Constant(), + op::Subtract(id_mul4_add1, id_mul5_add1_div3)), + op::Constant(), op::Constant())); + auto masked = op::Select( + op::And(op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant())), + op::Compare(op::Add(op::Iota(), op::Broadcast(id_mul5_add1_div3)), + op::Broadcast(op::Constant()))), + before_masking, op::Broadcast(op::Constant())); + auto rw = AllOf(op::Shape("f32[128,7,17,768]"), + op::ReduceWindow(masked, op::Constant())); + auto final_slice_index = op::Subtract( + id_mul5, + op::Add(op::Multiply(id_mul5_add1_div3, op::Constant()), op::Constant())); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Shape("f32[128,5,17,768]"), + op::DynamicSlice(rw, op::Constant(), final_slice_index, + op::Constant(), op::Constant()))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduce) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,1,1,2]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Constant(), + op::Constant(), op::Reshape())), + op::Shape("f32[4,32,32,64]")); + + EXPECT_THAT(root, + AllOf(op::Reduce(param0, op::Constant()), op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledTupleReduce) { + const char* const hlo_string = R"( +HloModule module + +%minmax_func { + %lhs_value = f32[] parameter(0) + %rhs_value = f32[] parameter(2) + %compare.2 = pred[] compare(%lhs_value, %rhs_value), direction=GT + %select.4 = f32[] select(%compare.2, %lhs_value, %rhs_value) + %lhs_index = s32[] 
parameter(1) + %rhs_index = s32[] parameter(3) + %select.5 = s32[] select(%compare.2, %lhs_index, %rhs_index) + ROOT %tuple.2 = (f32[], s32[]) tuple(%select.4, %select.5) +} + +ENTRY %main { + %param0 = f32[28,10] parameter(0), sharding={devices=[2,1]0,1} + %param1 = s32[28,10] parameter(1), sharding={devices=[2,1]0,1} + %init0 = f32[] parameter(2) + %init1 = s32[] parameter(3) + ROOT %reduce = (f32[28], s32[28]) reduce(%param0, %param1, %init0, %init1), + dimensions={1}, to_apply=%minmax_func, + sharding={{devices=[2]0,1}, {devices=[2]0,1}} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Reduce(op::Parameter(0), op::Parameter(1), + op::Parameter(2), op::Parameter(3)), + op::Shape("(f32[14], s32[14])"))); +} + +TEST_F(SpmdPartitioningTest, TiledToTiledReduceOutputReshard) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %param0 = f32[4,32,32,128] parameter(0) + %param0.copy = f32[4,32,32,128] copy(%param0), + sharding={devices=[1,2,1,1]0,1} + %constant.1 = f32[] constant(0), sharding={replicated} + %reduce = f32[128] reduce(%param0.copy, %constant.1), dimensions={0,1,2}, + to_apply=%sum, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto param0 = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[4,16,32,128]")); + + EXPECT_THAT(root, + AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Reduce(param0, op::Constant())), + op::Shape("f32[128]")), + op::Reshape()), + op::Shape("f32[64]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongNonTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=1, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Iota(), op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = s32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("s32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, U32IotaAlongTileDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + ROOT %iota = u32[16,80,91] iota(), iota_dimension=2, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Add(op::Iota(), op::Broadcast()), + op::Shape("u32[16,80,46]"))); +} + +TEST_F(SpmdPartitioningTest, 
Conditional) { + const char* const hlo_string = R"( +HloModule module + +Negate { + x = f32[4,5] parameter(0), sharding={replicated} + ROOT negate = f32[4,5] negate(x), sharding={replicated} +} + +Identity { + y = f32[4,5] parameter(0), sharding={devices=[2,1]0,1} + ROOT copy = f32[4,5] copy(y), sharding={devices=[2,1]0,1} +} + +ENTRY entry { + %param.0 = pred[] parameter(0) + %param.0.copy = pred[] copy(%param.0), sharding={maximal device=0} + %param.1 = f32[4,5] parameter(1) + %param.1.copy = f32[4,5] copy(%param.1), sharding={replicated} + %param.2 = f32[4,5] parameter(2) + %param.2.copy = f32[4,5] copy(%param.2), sharding={devices=[2,1]0,1} + ROOT cond = f32[4,5] conditional(%param.0.copy, %param.1.copy, %param.2.copy), + true_computation=Negate, false_computation=Identity, + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto param0 = AllOf(op::Copy(op::Copy(op::Parameter()), op::Shape("pred[]"))); + auto param1 = AllOf(op::Copy(op::Parameter()), op::Shape("f32[4,5]")); + auto param2 = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[2,5]")); + + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Conditional(op::AllReduce(), param1, param2), + op::Shape("f32[2,5]"))); + + auto then_branch_root = root->branch_computation(0)->root_instruction(); + EXPECT_THAT(then_branch_root, + AllOf(op::DynamicSlice(op::Negate(op::Parameter()), op::Reshape(), + op::Constant()), + op::Shape("f32[2,5]"))); + + auto else_branch_root = root->branch_computation(1)->root_instruction(); + EXPECT_THAT(else_branch_root, + AllOf(op::Copy(op::Parameter()), op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, SelectAndScatter_RetinaNet) { + const char* const hlo_string = R"( +HloModule module + +ge { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT compare = pred[] compare(a, b), direction=GE +} + +sum { + c = f32[] parameter(0) + d = f32[] parameter(1) + ROOT add = f32[] add(c, d) +} + +ENTRY entry { + %param.0 = f32[32,128,384,64] parameter(0) + %param.0.copy = f32[32,128,384,64] copy(%param.0), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + %param.1 = f32[32,64,192,64] parameter(1) + %param.1.copy = f32[32,64,192,64] copy(%param.1), + sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT select-and-scatter = f32[32,128,384,64] select-and-scatter(param.0.copy, + %param.1.copy, constant.1), window={size=1x1x1x1 stride=1x2x2x1}, + select=ge, scatter=sum, sharding={devices=[1,8,1,1]0,1,2,3,4,5,6,7} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/8)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto source = AllOf( + op::Shape("f32[32,8,192,64]"), + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + auto data = AllOf( + op::Shape("f32[32,16,384,64]"), + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant()))); + + EXPECT_THAT(root, op::SelectAndScatter(data, source, op::Constant())); + EXPECT_EQ(root->window().dimensions(0).padding_low(), 0); + EXPECT_EQ(root->window().dimensions(0).padding_high(), 0); +} + +TEST_F(SpmdPartitioningTest, TiledDot) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + 
%lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN( + auto module, + PartitionComputation(hlo_string, /*num_devices=*/2, + /*conv_halo_exchange_always_on_lhs=*/false)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]"))); +} + +TEST_F(SpmdPartitioningTest, TiledDotOutputTiled) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,64] parameter(0) + %lhs.copy = f32[128,64] copy(%lhs), sharding={devices=[1,2]0,1} + %rhs = f32[64,256] parameter(1) + %rhs.copy = f32[64,256] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %conv = f32[128,256] convolution(%lhs.copy, %rhs.copy), + dim_labels=bf_io->bf, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Constant(), + op::Reshape())), + op::Shape("f32[128,32]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(), op::Reshape(), + op::Constant())), + op::Shape("f32[32,256]")); + EXPECT_THAT(root, AllOf(op::DynamicSlice( + AllOf(op::AllReduce(op::Convolution(lhs, rhs)), + op::Shape("f32[128,256]")), + op::Constant(), op::Reshape()), + op::Shape("f32[128,128]"))); +} + +TEST_F(SpmdPartitioningTest, BatchPartitionedConvolution) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[128,256,256] parameter(0) + %lhs.copy = f32[128,256,256] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[256,8,1] parameter(1) + %rhs.copy = f32[256,8,1] copy(%rhs), sharding={replicated} + ROOT %conv = f32[128,256,8] convolution(%lhs.copy, %rhs.copy), + window={size=1}, dim_labels=0bf_io0->0bf, sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[128,128,256]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[256,8,1]")); + EXPECT_THAT(root, + AllOf(op::Convolution(lhs, rhs), op::Shape("f32[128,128,8]"))); +} + +TEST_F(SpmdPartitioningTest, DotOutputFeaturePartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[24,64] parameter(0) + %lhs.copy = f32[24,64] copy(%lhs), sharding={replicated} + %rhs = f32[39296,64] parameter(1) + %rhs.copy = f32[39296,64] copy(%rhs), sharding={devices=[2,1]0,1} + ROOT %dot = f32[24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={}, rhs_batch_dims={}, + lhs_contracting_dims={1}, rhs_contracting_dims={1}, + sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + 
PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant())), + op::Shape("f32[19648,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[2,1,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,24,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, op::DynamicSlice(rhs, op::Reshape(), + op::Constant(), + op::Constant())), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSandOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={devices=[1,2,1]0,1} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={devices=[2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,12,64]")); + auto 
rhs = AllOf(op::Copy(op::DynamicSlice(op::Parameter(1), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[16,39296,64]")); + auto lhs_reshard = op::Reshape(op::Transpose(op::AllToAll(op::Reshape(lhs)))); + EXPECT_THAT(root, + AllOf(op::Dot(lhs_reshard, rhs), op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputBatchPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[2,1,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(0)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,24,64]")); + auto rhs_slice = + AllOf(op::DynamicSlice(op::Copy(op::Parameter(1)), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[16,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs_slice, rhs_slice), + op::Shape("f32[16,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,1,2,2]0,1,2,3} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,1,2,2]0,1,2,3} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + op::Shape("f32[32,24,32,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), + op::Constant(), op::Reshape(), op::Reshape())), + op::Shape("f32[32,39296,32,64]")); + EXPECT_THAT(root, AllOf(op::AllReduce(op::Dot(lhs, rhs)), + op::Shape("f32[32,24,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumLHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,2]0,1,2,3} + %rhs = f32[32,39296,64] parameter(1) + %rhs.copy = f32[32,39296,64] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,128,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,2,2,1]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + 
op::Shape("f32[32,12,64,64]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,12,64,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSNonContractingDimsPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64] parameter(0) + %lhs.copy = f32[32,24,64] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={devices=[1,2,1,2]0,1,2,3} + ROOT %dot = f32[32,24,39296,128] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2}, rhs_contracting_dims={2}, + sharding={devices=[1,1,2,2]0,1,2,3} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/4)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64]")); + auto rhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(1), op::Constant(), op::Reshape(), + op::Constant(), op::Reshape())), + op::Shape("f32[32,19648,64,64]")); + EXPECT_THAT(root, AllOf(op::Dot(lhs, rhs), op::Shape("f32[32,24,19648,64]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputLHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Dot(AllOf(op::DynamicSlice(lhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[32,12,64,128]")), + rhs), + op::Shape("f32[32,12,39296]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumOutputRHSNonContractingDimPartitioned) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={replicated} + %rhs = f32[32,39296,64,128] parameter(1) + %rhs.copy = f32[32,39296,64,128] copy(%rhs), sharding={replicated} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("f32[32,24,64,128]")); + auto rhs = + AllOf(op::Copy(op::Parameter(1)), op::Shape("f32[32,39296,64,128]")); + EXPECT_THAT(root, + AllOf(op::Dot(lhs, AllOf(op::DynamicSlice( + rhs, op::Constant(), op::Reshape(), + op::Constant(), op::Constant()), + op::Shape("f32[32,19648,64,128]"))), + 
op::Shape("f32[32,24,19648]"))); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,64,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,19648,64,128]")); + EXPECT_THAT( + root, + AllOf(op::Slice(AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))), + op::Shape("f32[32,12,39295]"))); + auto while_loop = root->operand(0)->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot(op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::DynamicUpdateSlice(op::GetTupleElement(op::Parameter(0)), + partial_output, op::Constant(), + op::Constant(), op::Reshape()), + next_i)); + + // Check the conditional that contains the collective permute. 
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContracting) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = f32[32,24,63,128] parameter(0) + %lhs.copy = f32[32,24,63,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39296,63,128] parameter(1) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf( + op::Copy(op::DynamicSlice(op::Parameter(0), op::Constant(), op::Reshape(), + op::Constant(), op::Constant())), + op::Shape("f32[32,12,63,128]")); + auto rhs = + AllOf(op::Copy(op::DynamicSlice(op::Pad(op::Parameter(1), op::Constant()), + op::Constant(), op::Constant(), + op::Reshape(), op::Constant())), + op::Shape("f32[32,39296,32,128]")); + auto masked_rhs = + op::Select(op::Compare(), rhs, op::Broadcast(op::Constant())); + EXPECT_THAT(root, + AllOf(op::GetTupleElement(op::While(op::Tuple( + lhs, masked_rhs, op::Broadcast(), op::Constant()))), + op::Shape("f32[32,12,39296]"))); + auto while_loop = root->operand(0); + // Check loop condition. + EXPECT_THAT( + while_loop->while_condition()->root_instruction(), + op::Compare(op::GetTupleElement(op::Parameter(0)), op::Constant())); + + // Check loop body. + auto next_i = op::Add(op::GetTupleElement(op::Parameter(0)), op::Constant()); + auto window = op::Conditional(op::Compare(next_i, op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0))); + auto partial_output = op::Dot( + op::DynamicSlice( + op::Pad(op::GetTupleElement(op::Parameter(0)), op::Constant()), + op::Constant(), op::Constant(), op::Reshape(), op::Constant()), + op::GetTupleElement(op::Parameter(0))); + EXPECT_THAT( + while_loop->while_body()->root_instruction(), + op::Tuple(op::GetTupleElement(op::Parameter(0)), window, + op::Add(op::GetTupleElement(op::Parameter(0)), partial_output), + next_i)); + + // Check the conditional that contains the collective permute. 
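+  // As in the non-contracting case, the conditional should collective-permute
+  // the (masked) RHS shard to the next partition on all but the last
+  // iteration; here the loop accumulates partial dot results by addition
+  // rather than dynamic-update-slice.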
+ auto cp_conditional = + while_loop->while_body()->root_instruction()->operand(1); + EXPECT_THAT(cp_conditional->true_computation()->root_instruction(), + op::CollectivePermute(op::Parameter(0))); + EXPECT_THAT(cp_conditional->false_computation()->root_instruction(), + op::Parameter(0)); +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce1) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,24] reduce(%multiply, %constant), dimensions={2}, + to_apply=sum, sharding={devices=[1,2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedNonContractingReduce2) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + %lhs = f32[32,24,64,128] parameter(0) + %lhs.copy = f32[32,24,64,128] copy(%lhs), sharding={devices=[1,2,1,1]0,1} + %rhs = f32[32,39295,64,128] parameter(1) + %rhs.copy = f32[32,39295,64,128] copy(%rhs), sharding={devices=[1,2,1,1]0,1} + %dot = f32[32,24,39295] dot(%lhs.copy, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} + %constant = f32[] constant(0) + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,39295] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1]0,1} + %multiply = f32[32,24,39295] multiply(%dot, %broadcast), + sharding={devices=[1,2,1]0,1} + ROOT %reduce = f32[32,39295] reduce(%multiply, %constant), dimensions={1}, + to_apply=sum, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. 
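+  // (Presumably the multiply and reduce are moved into the windowed
+  // dot-general loop by the partitioner's code motion, so there is no stable
+  // output pattern to assert on; successful partitioning is the main check.)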
+} + +TEST_F(SpmdPartitioningTest, EinsumRHSWindowedContractingFromBroadcast) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %rhs = f32[32,39296,63,128] parameter(0) + %rhs.copy = f32[32,39296,63,128] copy(%rhs), sharding={devices=[1,1,2,1]0,1} + %constant.1 = f32[] constant(2) + %broadcast = f32[32,24,63,128] broadcast(%constant.1), dimensions={}, + sharding={devices=[1,2,1,1]0,1} + %add = f32[32,24,63,128] add(%broadcast, %broadcast), + sharding={devices=[1,2,1,1]0,1} + ROOT %dot = f32[32,24,39296] dot(%add, %rhs.copy), + lhs_batch_dims={0}, rhs_batch_dims={0}, + lhs_contracting_dims={2,3}, rhs_contracting_dims={2,3}, + sharding={devices=[1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, PartitionComputation(hlo_string, + /*num_devices=*/2)); + VLOG(1) << module->ToString(); + // Involves loop code motion, skips pattern matching. +} + +TEST_F(SpmdPartitioningTest, ReplicatedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={replicated} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={replicated} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Parameter(1)), op::Shape("s32[]")); + EXPECT_THAT( + root, + AllOf(op::AllReduce(op::Select( + op::Broadcast(op::Compare(op::PartitionId(), op::Constant())), + op::Rng(), op::Broadcast(op::Constant()))), + op::Shape("s32[4]"))); +} + +TEST_F(SpmdPartitioningTest, PartitionedRng) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %lhs = s32[] parameter(0) + %lhs.copy = s32[] copy(%lhs), sharding={replicated} + %rhs = s32[] parameter(1) + %rhs.copy = s32[] copy(%rhs), sharding={maximal device=1} + ROOT %rng = s32[4]{0} rng(%lhs.copy, %rhs.copy), + distribution=rng_uniform, sharding={devices=[2]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto lhs = AllOf(op::Copy(op::Parameter(0)), op::Shape("s32[]")); + auto rhs = AllOf(op::Copy(op::Copy(op::Parameter(1))), op::Shape("s32[]")); + EXPECT_THAT(root, AllOf(op::Rng(lhs, op::AllReduce(op::Select( + op::Broadcast(op::Compare()), rhs, + op::Broadcast(op::Constant())))), + op::Shape("s32[2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + ROOT %dynamic-slice = s32[128,2] dynamic-slice(%input.copy, %constant, %index), + dynamic_slice_sizes={128,2}, sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + EXPECT_THAT(root, + AllOf(op::DynamicSlice(input, 
op::Constant(), op::Parameter(1)), + op::Shape("s32[64,2]"))); +} + +TEST_F(SpmdPartitioningTest, DynamicUpdateSliceAlongNonPartitionedDimension) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[128,64] parameter(0) + %input.copy = s32[128,64] copy(%input), sharding={devices=[2,1]0,1} + %index = s32[] parameter(1) + %constant = s32[] constant(0) + %update = s32[128,2] parameter(2) + %update.copy = s32[128,2] copy(%update), sharding={devices=[2,1]0,1} + ROOT %dynamic-update-slice = s32[128,64] + dynamic-update-slice(%input.copy, %update.copy, %constant, %index), + sharding={devices=[2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto root = module->entry_computation()->root_instruction(); + auto input = AllOf(op::Copy(op::DynamicSlice(op::Parameter(0), op::Reshape(), + op::Constant())), + op::Shape("s32[64,64]")); + auto update = AllOf(op::Copy(op::DynamicSlice(op::Parameter(2), op::Reshape(), + op::Constant())), + op::Shape("s32[64,2]")); + EXPECT_THAT(root, AllOf(op::DynamicUpdateSlice(input, update, op::Constant(), + op::Parameter(1)), + op::Shape("s32[64,64]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughGather) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[2,9] parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + ROOT %gather = f32[3,9] gather(%input, %indices), offset_dims={1}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=1, + slice_sizes={1,9}, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Gather(op::Parameter(0), op::Parameter(1)), + op::Shape("f32[3,5]"))); +} + +TEST_F(SpmdPartitioningTest, GatherPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + ROOT %gather = f32[2,3,9] gather(%input, %indices), offset_dims={2}, + collapsed_slice_dims={0}, start_index_map={0}, index_vector_dim=2, + slice_sizes={1,9}, sharding={replicated} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto min = AllOf(op::Broadcast(offset), op::Shape("s32[2,3]")); + auto max = AllOf(op::Broadcast(op::Add(offset, op::Constant())), + op::Shape("s32[2,3]")); + auto clamp = op::Clamp(min, op::Parameter(1), max); + auto gather = op::Gather(op::Parameter(0), op::Subtract(clamp, min)); + auto mask = + op::Or(op::Lt(op::Parameter(1), min), op::Gt(op::Parameter(1), max)); + auto masked = + op::Select(op::Broadcast(mask), op::Broadcast(op::Constant()), gather); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::AllReduce(masked), op::Shape("f32[2,3,9]"))); +} + +TEST_F(SpmdPartitioningTest, PassthroughScatter) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[2,9] 
parameter(0), sharding={devices=[1,2]0,1} + %indices = s32[3] parameter(1), sharding={replicated} + %updates = f32[3,9] parameter(2), sharding={devices=[1,2]0,1} + ROOT %scatter = f32[2,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={1}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=1, sharding={devices=[1,2]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Scatter(op::Parameter(0), op::Parameter(1), + op::Parameter(2)), + op::Shape("f32[2,5]"))); +} + +TEST_F(SpmdPartitioningTest, ScatterPartitionedOnTrivialSliceDims) { + const char* const hlo_string = R"( +HloModule module + +add (lhs: f32[], rhs: f32[]) -> f32[] { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT sum = f32[] add(lhs, rhs) +} + +ENTRY entry { + %input = f32[17,9] parameter(0), sharding={devices=[2,1]0,1} + %indices = s32[2,3] parameter(1), sharding={replicated} + %updates = f32[2,3,9] parameter(2), sharding={replicated} + ROOT %scatter = f32[17,9] scatter(%input, %indices, %updates), + to_apply=add, + update_window_dims={2}, + inserted_window_dims={0}, + scatter_dims_to_operand_dims={0}, + index_vector_dim=2, sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + auto offset = op::Reshape( + op::DynamicSlice(op::Constant(), op::PartitionId(), op::Constant())); + auto indices = op::Subtract( + op::Parameter(1), AllOf(op::Broadcast(offset), op::Shape("s32[2,3]"))); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, + AllOf(op::Scatter(op::Parameter(0), indices, op::Parameter(2)), + op::Shape("f32[9,9]"))); +} + +TEST_F(SpmdPartitioningTest, TiledReverse) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + constant = f32[3,3]{1,0} constant({{1,1,1},{1,1,1},{1,1,1}}), + sharding={devices=[2,1]0,1} + ROOT reverse = f32[3,3]{1,0} reverse(constant), dimensions={1}, + sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[2,3]{1,0}"), + op::Reverse(op::DynamicSlice( + op::Pad(op::Constant(), op::Constant()), + op::Reshape(), op::Constant())))); +} + +TEST_F(SpmdPartitioningTest, MixWithManualPartitioning) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + param = f32[8,2] parameter(0), sharding={devices=[2,1]0,1} + to_shard = f32[4,2] custom-call(param), custom_call_target="SPMDFullToShardShape", sharding={replicated} + add = f32[4,2] add(to_shard, to_shard), sharding={replicated} + to_full = f32[8,2] custom-call(add), custom_call_target="SPMDShardToFullShape", sharding={devices=[2,1]0,1} + ROOT mul = f32[8,2] multiply(to_full, param), sharding={devices=[2,1]0,1} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + auto to_shard = op::Copy(op::Parameter(0)); + EXPECT_THAT(root, AllOf(op::Shape("f32[4,2]"), + op::Multiply(op::Copy(op::Add(to_shard, to_shard)), + op::Parameter(0)))); +} + +} // 
namespace +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc new file mode 100644 index 00000000000..207f854cd9f --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -0,0 +1,662 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" + +namespace xla { +namespace spmd { + +bool HasReplicatedSharding(const HloSharding& sharding) { + if (sharding.IsTuple()) { + return absl::c_any_of(sharding.tuple_elements(), HasReplicatedSharding); + } + return sharding.IsReplicated(); +} + +HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b) { + if (shape.IsTuple()) { + std::vector elements; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + elements.push_back( + CreateZero(ShapeUtil::GetTupleElementShape(shape, i), b)); + } + return b->AddInstruction(HloInstruction::CreateTuple(elements)); + } + + if (shape.IsToken()) { + return b->AddInstruction(HloInstruction::CreateToken()); + } + auto zero = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(shape.element_type()))); + return b->AddInstruction(HloInstruction::CreateBroadcast(shape, zero, {})); +} + +HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module) { + HloComputation::Builder sum_b("add"); + auto x = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(type, {}), "x")); + auto y = sum_b.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(type, {}), "y")); + if (type == PRED) { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kOr, x, y)); + } else { + sum_b.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(type, {}), HloOpcode::kAdd, x, y)); + } + HloComputation* reduction = module->AddEmbeddedComputation(sum_b.Build()); + return reduction; +} + +bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding) { + if (sharding.IsTuple()) { + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + if (!EvenlyPartitions(ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}))) { + return false; + } + } + } + + if (sharding.IsTileMaximal()) { + return 
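+        // A tile-maximal sharding that is not replicated places the full
+        // shape on a single device, which is not an even partition.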
sharding.IsReplicated(); + } + for (int64 i = 0; i < shape.dimensions_size(); ++i) { + if (shape.dimensions(i) % sharding.tile_assignment().dim(i) != 0) { + return false; + } + } + return true; +} + +Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back( + MakePartitionedShape(ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}))); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + return sharding.TileShape(shape); +} + +Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, + const HloSharding& sharding, + int64 partition_id) { + if (sharding.IsTuple()) { + std::vector subshapes; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + subshapes.push_back(MakeNonPaddedShapeForGivenPartition( + ShapeUtil::GetTupleElementShape(shape, i), + sharding.GetSubSharding(shape, {i}), partition_id)); + } + return ShapeUtil::MakeTupleShape(subshapes); + } + + auto partition_shape = shape; + std::vector tile_offset = + sharding.TileOffsetForDevice(shape, partition_id); + std::vector tile_limit = + sharding.TileLimitForDevice(shape, partition_id); + for (int64 i = 0; i < tile_offset.size(); ++i) { + if (sharding.UsesDevice(partition_id)) { + partition_shape.set_dimensions(i, tile_limit[i] - tile_offset[i]); + } else { + partition_shape.set_dimensions(i, 0); + } + } + return partition_shape; +} + +std::vector MakePartitionOffsets(const Shape& shape, + const HloSharding& sharding, + HloInstruction* partition_id, + SpmdBuilder* b) { + CHECK(!shape.IsTuple()); + + Array2D offset_array( + {sharding.tile_assignment().num_elements(), shape.rank()}); + offset_array.Each([&](int64 i, int64 j, int32* value) { + *value = sharding.TileOffsetForDevice(shape, i)[j]; + }); + auto offset_table = b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR2FromArray2D(offset_array))); + std::vector offsets; + for (int64 i = 0; i < shape.rank(); ++i) { + if (sharding.tile_assignment().dim(i) == 1) { + offsets.push_back(b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32)))); + } else { + auto index = b->AddInstruction(HloInstruction::CreateDynamicSlice( + ShapeUtil::MakeShape(S32, {1, 1}), offset_table, + {partition_id, b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(i)))}, + {1, 1})); + offsets.push_back(b->AddInstruction( + HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {}), index))); + } + } + return offsets; +} + +std::vector MakeTiledPartitionOrdinals( + const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b) { + CHECK(!sharding.IsTileMaximal()); + auto table_shape = + ShapeUtil::MakeShape(S32, sharding.tile_assignment().dimensions()); + return MakePartitionOffsets(table_shape, sharding, partition_id, b); +} + +HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape, + SpmdBuilder* b, HloComputation* computation) { + CHECK(b == nullptr || computation == nullptr); + if (ShapeUtil::Compatible(hlo->shape(), padded_shape)) { + return hlo; + } + PaddingConfig padding_config; + for (int64 i = 0; i < padded_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_high(padded_shape.dimensions(i) - + hlo->shape().dimensions(i)); + } + auto add_hlo 
= [&](std::unique_ptr to_add) { + if (b == nullptr) { + return computation->AddInstruction(std::move(to_add)); + } + return b->AddInstruction(std::move(to_add)); + }; + auto zero = add_hlo(HloInstruction::CreateConstant( + LiteralUtil::Zero(hlo->shape().element_type()))); + return add_hlo( + HloInstruction::CreatePad(padded_shape, hlo, zero, padding_config)); +} + +Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape, + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return base_shape; + } + if (EvenlyPartitions(base_shape, sharding)) { + return base_shape; + } + auto shard_shape = MakePartitionedShape(base_shape, sharding); + Shape padded_base_shape = base_shape; + for (int64 i = 0; i < padded_base_shape.rank(); ++i) { + padded_base_shape.set_dimensions( + i, shard_shape.dimensions(i) * sharding.tile_assignment().dim(i)); + } + return padded_base_shape; +} + +HloInstruction* PadBaseShapeBeforeUnevenTiledSharding( + HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b) { + auto padded_base_shape = + GetPaddedShapeForUnevenPartitioning(hlo->shape(), sharding); + if (ShapeUtil::Compatible(padded_base_shape, hlo->shape())) { + return hlo; + } + return PadToShape(hlo, padded_base_shape, b); +} + +absl::optional UniqueTiledDim(const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + return absl::nullopt; + } + int64 dim = -1; + for (int64 i = 0; i < sharding.tile_assignment().num_dimensions(); ++i) { + if (sharding.tile_assignment().dim(i) > 1) { + if (dim != -1) { + return absl::nullopt; + } + dim = i; + } + } + CHECK_NE(dim, -1); + return dim; +} + +MultiplyAddDivideOffsetCalculation::MultiplyAddDivideOffsetCalculation( + int64 multiplier, int64 offset, int64 divisor) + : multiplier_(multiplier), offset_(offset), divisor_(divisor) { + CHECK_GT(divisor_, 0); + Simplify(); +} + +OffsetCalculation MultiplyAddDivideOffsetCalculation::operator-( + const MultiplyAddDivideOffsetCalculation& other) const { + if (divisor_ == 1 && other.divisor_ == 1) { + return OffsetCalculation(MultiplyAddDivideOffsetCalculation( + multiplier_ - other.multiplier_, offset_ - other.offset_, 1)); + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +void MultiplyAddDivideOffsetCalculation::Simplify() { + // We could simplify the calculation when multiplier is a multiple of + // divisor_. However, when offset_ is not a multiple of divisor_, we must + // make sure that offset_ and multiplier_ are both non-negative or both + // non-positive. E.g., (3 * i - 1) / 3 is not equivalent to i or i - 1. 
+ if (divisor_ != 1 && multiplier_ % divisor_ == 0 && + (offset_ % divisor_ == 0 || offset_ * multiplier_ > 0)) { + multiplier_ /= divisor_; + offset_ /= divisor_; + divisor_ = 1; + } +} + +int64 MultiplyAddDivideOffsetCalculation::Calculate(int64 shard_ordinal) const { + return (shard_ordinal * multiplier_ + offset_) / divisor_; +} + +HloInstruction* MultiplyAddDivideOffsetCalculation::Calculate( + HloInstruction* shard_ordinal, SpmdBuilder* b) const { + auto scalar_shape = ShapeUtil::MakeShape(S32, {}); + if (multiplier_ == 0) { + return b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(offset_ / divisor_))); + } + HloInstruction* result = shard_ordinal; + if (multiplier_ != 1) { + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kMultiply, shard_ordinal, + b->AddInstruction(HloInstruction::CreateConstant( + LiteralUtil::CreateR0(multiplier_))))); + } + if (offset_ != 0) { + auto offset = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(offset_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kAdd, result, offset)); + } + if (divisor_ != 1) { + auto divisor = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0(divisor_))); + result = b->AddInstruction(HloInstruction::CreateBinary( + scalar_shape, HloOpcode::kDivide, result, divisor)); + } + return result; +} + +int64 MultiplyAddDivideOffsetCalculation::MaxInRange( + int64 start_ordinal, int64 limit_ordinal) const { + int64 max = Calculate(start_ordinal); + for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) { + max = std::max(max, Calculate(i)); + } + return max; +} + +OffsetCalculation& OffsetCalculation::operator=( + const OffsetCalculation& other) { + opcode_ = other.opcode_; + copy_from_ = other.copy_from_; + if (opcode_ != HloOpcode::kCopy) { + lhs_ = absl::make_unique(*other.lhs_); + rhs_ = absl::make_unique(*other.rhs_); + } + return *this; +} + +bool OffsetCalculation::IsConstant() const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.IsConstant(); + } + if (opcode_ == HloOpcode::kSubtract && *lhs_ == *rhs_) { + return true; + } + return lhs_->IsConstant() && rhs_->IsConstant(); +} + +OffsetCalculation OffsetCalculation::operator-( + const OffsetCalculation& other) const { + if (opcode_ == HloOpcode::kCopy && other.opcode_ == HloOpcode::kCopy) { + return copy_from_ - other.copy_from_; + } + return OffsetCalculation(HloOpcode::kSubtract, *this, other); +} + +bool OffsetCalculation::operator==(const OffsetCalculation& other) const { + if (opcode_ != other.opcode_) { + return false; + } + if (opcode_ == HloOpcode::kCopy) { + return copy_from_ == other.copy_from_; + } + return *lhs_ == *other.lhs_ && *rhs_ == *other.rhs_; +} + +int64 OffsetCalculation::Calculate(int64 shard_ordinal) const { + switch (opcode_) { + case HloOpcode::kCopy: + return copy_from_.Calculate(shard_ordinal); + case HloOpcode::kSubtract: + return lhs_->Calculate(shard_ordinal) - rhs_->Calculate(shard_ordinal); + case HloOpcode::kMultiply: + return lhs_->Calculate(shard_ordinal) * rhs_->Calculate(shard_ordinal); + default: + LOG(FATAL) << "Should not happen"; + } +} + +HloInstruction* OffsetCalculation::Calculate(HloInstruction* shard_ordinal, + SpmdBuilder* b) const { + if (opcode_ == HloOpcode::kCopy) { + return copy_from_.Calculate(shard_ordinal, b); + } + auto lhs = lhs_->Calculate(shard_ordinal, b); + auto rhs = rhs_->Calculate(shard_ordinal, b); + return b->AddInstruction( + 
HloInstruction::CreateBinary(lhs->shape(), opcode_, lhs, rhs)); +} + +int64 OffsetCalculation::MaxInRange(int64 start_ordinal, + int64 limit_ordinal) const { + if (IsConstant()) { + return Calculate(start_ordinal); + } + if (opcode_ == HloOpcode::kCopy) { + return std::max(Calculate(start_ordinal), Calculate(limit_ordinal - 1)); + } + int64 max = Calculate(start_ordinal); + for (int64 i = start_ordinal + 1; i < limit_ordinal; ++i) { + max = std::max(max, Calculate(i)); + } + return max; +} + +absl::optional ExchangeHalo( + HloInstruction* hlo, const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, int64 dim, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b) { + int64 input_shard_size = hlo->shape().dimensions(dim); + int64 shard_count = target.tile_assignment().dim(dim); + + std::vector concat_pieces; + + int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); + if (max_left_halo_size > input_shard_size) { + VLOG(1) << "ExchangeHalo failed: halo is beyond the left neighbor."; + return absl::nullopt; + } + if (max_left_halo_size > 0) { + std::vector> source_target_pairs; + target.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (indices[dim] > 0) { + std::vector source_indices(indices.begin(), indices.end()); + source_indices[dim] -= 1; + source_target_pairs.emplace_back( + target.tile_assignment()(source_indices), device); + } + }); + auto halo_shape = hlo->shape(); + auto source_halo_slice = hlo; + if (max_left_halo_size != hlo->shape().dimensions(dim)) { + halo_shape.set_dimensions(dim, max_left_halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + halo_start_indices[dim] = + hlo->shape().dimensions(dim) - max_left_halo_size; + std::vector halo_slice_strides(halo_shape.rank(), 1); + + source_halo_slice = b->AddInstruction( + hlo->CreateSlice(halo_shape, hlo, halo_start_indices, + hlo->shape().dimensions(), halo_slice_strides)); + } + auto left_halo = + collective_ops_creator.create_cross_partition_collective_permute( + b, source_halo_slice, source_target_pairs, (*next_channel_id)++); + concat_pieces.push_back(left_halo); + } + + concat_pieces.push_back(hlo); + + // Right halo. + int64 max_right_halo_size = + right_halo_size_function.MaxInRange(0, shard_count - 1); + if (max_right_halo_size > input_shard_size) { + VLOG(1) << "ExchangeHalo failed: halo is beyond the right neighbor."; + return absl::nullopt; + } + if (max_right_halo_size > 0) { + std::vector> source_target_pairs; + target.tile_assignment().Each( + [&](absl::Span indices, int64 device) { + if (indices[dim] > 0) { + std::vector target_indices(indices.begin(), indices.end()); + target_indices[dim] -= 1; + source_target_pairs.emplace_back( + device, target.tile_assignment()(target_indices)); + } + }); + auto halo_shape = hlo->shape(); + halo_shape.set_dimensions(dim, max_right_halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + std::vector halo_slice_strides(halo_shape.rank(), 1); + + auto source_halo_slice = b->AddInstruction( + hlo->CreateSlice(halo_shape, hlo, halo_start_indices, + halo_shape.dimensions(), halo_slice_strides)); + auto right_halo = + collective_ops_creator.create_cross_partition_collective_permute( + b, source_halo_slice, source_target_pairs, (*next_channel_id)++); + concat_pieces.push_back(right_halo); + } + + auto concat = hlo; + // Concat with halos/padding. 
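+  // At this point concat_pieces holds, in order, the halo received from the
+  // left neighbor (if any), the local shard, and the halo received from the
+  // right neighbor (if any); concatenating them along `dim` yields the shard
+  // together with its halo regions.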
+ if (concat_pieces.size() > 1) { + auto concat_shape = hlo->shape(); + int64 concat_dim_size = 0; + for (auto piece : concat_pieces) { + concat_dim_size += piece->shape().dimensions(dim); + } + concat_shape.set_dimensions(dim, concat_dim_size); + concat = b->AddInstruction( + HloInstruction::CreateConcatenate(concat_shape, concat_pieces, dim)); + } + + return concat; +} + +absl::optional ExchangeHalo( + HloInstruction* hlo, + std::vector left_halo_size_functions, + std::vector right_halo_size_functions, + const HloSharding& target, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b) { + CHECK(left_halo_size_functions.size() == hlo->shape().rank()); + CHECK(right_halo_size_functions.size() == hlo->shape().rank()); + + HloInstruction* visiting_hlo = hlo; + for (int dim = 0; dim < hlo->shape().rank(); ++dim) { + auto concat = ExchangeHalo(visiting_hlo, left_halo_size_functions[dim], + right_halo_size_functions[dim], dim, target, + collective_ops_creator, next_channel_id, b); + if (!concat) { + return absl::nullopt; + } + visiting_hlo = *concat; + } + return visiting_hlo; +} + +absl::optional ExchangeHaloAndGetValidData( + HloInstruction* hlo, const Shape& base_shape, + const OffsetCalculation& left_halo_size_function, + const OffsetCalculation& right_halo_size_function, + int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size, + int64 shard_size_with_halo, int64 dim, const HloSharding& target, + HloInstruction* offset_on_padded_shape, HloInstruction* pad_value, + HloInstruction* partition_ordinal, + const SPMDCollectiveOpsCreator& collective_ops_creator, + int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region) { + auto halo_exchange_result = + ExchangeHalo(hlo, left_halo_size_function, right_halo_size_function, dim, + target, collective_ops_creator, next_channel_id, b); + if (!halo_exchange_result) { + return absl::nullopt; + } + auto concat = *halo_exchange_result; + int64 shard_count = target.tile_assignment().dim(dim); + int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); + + // Now we determine if we need extra padding after the concat. + // + // The max of halo size or the first shard's explicit left padding. + int64 max_left_halo_or_padding_size = + std::max(std::max(int64{0}, max_left_halo_size), + explicit_left_padding_on_full_shape); + // The calculation that returns the dynamic slice index for a shard on the + // padded concat, which is the difference between + // max_left_halo_or_padding_size and its left halo size. + auto start_offset_on_padded_concat_calculation = + OffsetCalculation(MultiplyAddDivideOffsetCalculation( + 0, max_left_halo_or_padding_size, 1)) - + left_halo_size_function; + + // See if we need to pad the concat before dynamic slice. 
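+  // extra_left_padding covers the case where the explicit left padding on the
+  // full shape exceeds the largest left halo, and extra_right_padding makes
+  // sure the concat is long enough for every shard's dynamic-slice window of
+  // size shard_size_with_halo.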
+ int64 extra_left_padding = + std::max(int64{0}, max_left_halo_or_padding_size - + std::max(int64{0}, max_left_halo_size)); + int64 extra_right_padding = + start_offset_on_padded_concat_calculation.MaxInRange(0, shard_count) + + shard_size_with_halo - concat->shape().dimensions(dim) - + extra_left_padding; + extra_right_padding = std::max(int64{0}, extra_right_padding); + if (extra_left_padding > 0 || extra_right_padding > 0) { + PaddingConfig padding_config; + auto padded_concat_shape = concat->shape(); + for (int64 i = 0; i < base_shape.rank(); ++i) { + auto padding_config_dim = padding_config.add_dimensions(); + padding_config_dim->set_interior_padding(0); + padding_config_dim->set_edge_padding_low(0); + padding_config_dim->set_edge_padding_high(0); + if (i != dim) { + continue; + } + padding_config_dim->set_edge_padding_low(extra_left_padding); + padding_config_dim->set_edge_padding_high(extra_right_padding); + padded_concat_shape.set_dimensions(dim, concat->shape().dimensions(dim) + + extra_left_padding + + extra_right_padding); + } + concat = b->AddInstruction(HloInstruction::CreatePad( + padded_concat_shape, concat, pad_value, padding_config)); + } + + auto valid_slice = concat; + if (shard_size_with_halo != concat->shape().dimensions(dim)) { + // Concat is bigger than the shard shape, so we need a dynamic slice. + CHECK_LT(shard_size_with_halo, concat->shape().dimensions(dim)); + auto slice_shape = concat->shape(); + slice_shape.set_dimensions(dim, shard_size_with_halo); + + if (left_halo_size_function.IsConstant() && + left_halo_size_function.Calculate(0) == + explicit_left_padding_on_full_shape) { + std::vector start_indices(slice_shape.rank(), 0); + std::vector strides(slice_shape.rank(), 1); + valid_slice = b->AddInstruction( + HloInstruction::CreateSlice(slice_shape, concat, start_indices, + slice_shape.dimensions(), strides)); + } else { + auto zero = b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::Zero(S32))); + std::vector slice_offsets(base_shape.rank(), zero); + slice_offsets[dim] = start_offset_on_padded_concat_calculation.Calculate( + partition_ordinal, b); + valid_slice = b->AddInstruction(HloInstruction::CreateDynamicSlice( + slice_shape, concat, slice_offsets, slice_shape.dimensions())); + } + } + + if (!mask_invalid_region) { + return valid_slice; + } + + int64 total_right_padding = padded_full_shape_size - + base_shape.dimensions(dim) - + explicit_left_padding_on_full_shape; + // Mask off garbage data due to uneven partition or low/high padding. 
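+  // The mask is an iota of element indices offset by this shard's start on
+  // the padded full shape; elements whose index falls outside
+  // [explicit_left_padding_on_full_shape,
+  //  explicit_left_padding_on_full_shape + base dimension size) are replaced
+  // with pad_value.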
+ if (explicit_left_padding_on_full_shape > 0 || total_right_padding > 0) { + auto index_shape = ShapeUtil::ChangeElementType(valid_slice->shape(), S32); + auto iota = b->AddInstruction(HloInstruction::CreateIota(index_shape, dim)); + auto broadcast_start_index_in_padded_shape = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, offset_on_padded_shape, {})); + auto index_in_padded_shape = b->AddInstruction( + HloInstruction::CreateBinary(index_shape, HloOpcode::kAdd, iota, + broadcast_start_index_in_padded_shape)); + auto mask_shape = ShapeUtil::ChangeElementType(index_shape, PRED); + std::vector predicates; + if (explicit_left_padding_on_full_shape > 0) { + auto valid_index_start = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_start, + ComparisonDirection::kGe))); + } + if (total_right_padding > 0) { + auto valid_index_limit = + b->AddInstruction(HloInstruction::CreateBroadcast( + index_shape, + b->AddInstruction( + HloInstruction::CreateConstant(LiteralUtil::CreateR0( + base_shape.dimensions(dim) + + explicit_left_padding_on_full_shape))), + {})); + predicates.push_back(b->AddInstruction(HloInstruction::CreateCompare( + mask_shape, index_in_padded_shape, valid_index_limit, + ComparisonDirection::kLt))); + } + CHECK(!predicates.empty()); + auto is_valid = + predicates.size() == 2 + ? b->AddInstruction(HloInstruction::CreateBinary( + mask_shape, HloOpcode::kAnd, predicates[0], predicates[1])) + : predicates[0]; + auto masking_value = b->AddInstruction( + HloInstruction::CreateBroadcast(valid_slice->shape(), pad_value, {})); + valid_slice = b->AddInstruction( + HloInstruction::CreateTernary(valid_slice->shape(), HloOpcode::kSelect, + is_valid, valid_slice, masking_value)); + } + return valid_slice; +} + +} // namespace spmd +} // namespace xla diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h new file mode 100644 index 00000000000..f96b23d7073 --- /dev/null +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -0,0 +1,229 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_
+
+#include <memory>
+#include <string>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding.h"
+#include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h"
+
+namespace xla {
+namespace spmd {
+
+// Returns true if the given sharding contains any replicated sharding.
+bool HasReplicatedSharding(const HloSharding& sharding);
+
+// Creates zero value instructions of the given shape.
+HloInstruction* CreateZero(const Shape& shape, SpmdBuilder* b);
+
+template <typename NativeT>
+HloInstruction* CreateR0WithType(PrimitiveType type, NativeT value,
+                                 SpmdBuilder* b) {
+  auto literal = LiteralUtil::CreateR0<NativeT>(value)
+                     .ConvertToShape(ShapeUtil::MakeShape(type, {}))
+                     .ValueOrDie();
+  return b->AddInstruction(HloInstruction::CreateConstant(std::move(literal)));
+}
+
+// Create a binary add computation of the given type and add to the module.
+HloComputation* MakeBinaryAdd(PrimitiveType type, HloModule* module);
+
+// Returns true if the shape can be evenly partitioned for the given sharding.
+// All tile-sharded dimensions should be evenly divisible and there should be
+// no single-device sharding. Replicated sharding is considered an even
+// partition.
+bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding);
+
+// Returns the shard shape of the given shape when it is partitioned for the
+// target sharding.
+Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding);
+
+// Returns the shard shape for a partition without padding due to uneven
+// sharding.
+Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape,
+                                          const HloSharding& sharding,
+                                          int64 partition_id);
+
+// Generates the HLO instructions that represent the dimension offsets on any
+// device. The size of the returned vector is the rank of the given shape.
+std::vector<HloInstruction*> MakePartitionOffsets(const Shape& shape,
+                                                  const HloSharding& sharding,
+                                                  HloInstruction* partition_id,
+                                                  SpmdBuilder* b);
+
+// Returns the offsets of the partition in the tile assignment.
+std::vector<HloInstruction*> MakeTiledPartitionOrdinals(
+    const HloSharding& sharding, HloInstruction* partition_id, SpmdBuilder* b);
+
+// Pads hlo to the desired shape using high padding. Either a builder or a
+// computation needs to be supplied, but not both.
+HloInstruction* PadToShape(HloInstruction* hlo, const Shape& padded_shape,
+                           SpmdBuilder* b,
+                           HloComputation* computation = nullptr);
+
+// Returns the padded shape when combining all partitions.
+Shape GetPaddedShapeForUnevenPartitioning(const Shape& base_shape,
+                                          const HloSharding& sharding);
+
+// Pads the HLO (with base shape) for uneven tiled partition to make it evenly
+// partitionable.
+HloInstruction* PadBaseShapeBeforeUnevenTiledSharding(
+    HloInstruction* hlo, const HloSharding& sharding, SpmdBuilder* b);
+
+// Returns the index of the unique tile dimension. Returns absl::nullopt if the
+// given sharding is not tiled or is tiled along multiple dimensions.
+absl::optional<int64> UniqueTiledDim(const HloSharding& sharding);
+
+// Utilities for symbolic offset calculation and halo exchange.
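+// For example, a per-shard left halo of size (i + 1) / 2 for shard ordinal i
+// can be written as MultiplyAddDivideOffsetCalculation(1, 1, 2), and the
+// difference of two such calculations is represented symbolically by an
+// OffsetCalculation (both defined below).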
+class OffsetCalculation;
+
+// Represents a calculation over integers:
+//   (shard_ordinal * multiplier + offset) / divisor
+class MultiplyAddDivideOffsetCalculation {
+ public:
+  MultiplyAddDivideOffsetCalculation()
+      : multiplier_(0), offset_(0), divisor_(1) {}
+  MultiplyAddDivideOffsetCalculation(int64 multiplier, int64 offset,
+                                     int64 divisor);
+
+  OffsetCalculation operator-(
+      const MultiplyAddDivideOffsetCalculation& other) const;
+
+  bool operator==(const MultiplyAddDivideOffsetCalculation& other) const {
+    return multiplier_ == other.multiplier_ && offset_ == other.offset_ &&
+           divisor_ == other.divisor_;
+  }
+
+  bool IsConstant() const { return multiplier_ == 0; }
+  void Simplify();
+  int64 Calculate(int64 shard_ordinal) const;
+  HloInstruction* Calculate(HloInstruction* shard_ordinal,
+                            SpmdBuilder* b) const;
+
+  // Returns the maximum result for shard ordinals in the range
+  // [start_ordinal, limit_ordinal).
+  int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const;
+
+ private:
+  int64 multiplier_;
+  int64 offset_;
+  int64 divisor_;
+};
+
+// Represents a calculation over integers based on results of other
+// calculations, combined by an opcode. If the opcode is kCopy, it simply wraps
+// a MultiplyAddDivideOffsetCalculation.
+class OffsetCalculation {
+ public:
+  OffsetCalculation() : opcode_(HloOpcode::kCopy), copy_from_() {}
+  explicit OffsetCalculation(
+      const MultiplyAddDivideOffsetCalculation& copy_from)
+      : opcode_(HloOpcode::kCopy), copy_from_(copy_from) {}
+  OffsetCalculation(const OffsetCalculation& copy_from) { *this = copy_from; }
+  OffsetCalculation(HloOpcode opcode,
+                    const MultiplyAddDivideOffsetCalculation& lhs,
+                    const MultiplyAddDivideOffsetCalculation& rhs)
+      : opcode_(opcode),
+        lhs_(absl::make_unique<OffsetCalculation>(lhs)),
+        rhs_(absl::make_unique<OffsetCalculation>(rhs)) {}
+  OffsetCalculation(HloOpcode opcode, const OffsetCalculation& lhs,
+                    const OffsetCalculation& rhs)
+      : opcode_(opcode),
+        lhs_(absl::make_unique<OffsetCalculation>(lhs)),
+        rhs_(absl::make_unique<OffsetCalculation>(rhs)) {}
+
+  OffsetCalculation& operator=(const OffsetCalculation& other);
+
+  // Returns whether the calculation returns the same value for all shards.
+  // This is conservative and could return false even if it is actually
+  // constant.
+  bool IsConstant() const;
+
+  OffsetCalculation operator-(const OffsetCalculation& other) const;
+  bool operator==(const OffsetCalculation& other) const;
+  int64 Calculate(int64 shard_ordinal) const;
+  HloInstruction* Calculate(HloInstruction* shard_ordinal,
+                            SpmdBuilder* b) const;
+
+  // Returns the maximum result for shard ordinals in the range
+  // [start_ordinal, limit_ordinal).
+  int64 MaxInRange(int64 start_ordinal, int64 limit_ordinal) const;
+
+ private:
+  HloOpcode opcode_;
+  std::unique_ptr<OffsetCalculation> lhs_;
+  std::unique_ptr<OffsetCalculation> rhs_;
+  MultiplyAddDivideOffsetCalculation copy_from_;
+};
+
+// Performs halo exchange on the given dimension based on the provided
+// left/right halo size functions. Returns nullopt if the halo is beyond the
+// direct neighbor of the shard.
+absl::optional<HloInstruction*> ExchangeHalo(
+    HloInstruction* hlo, const OffsetCalculation& left_halo_size_function,
+    const OffsetCalculation& right_halo_size_function, int64 dim,
+    const HloSharding& target,
+    const SPMDCollectiveOpsCreator& collective_ops_creator,
+    int64* next_channel_id, SpmdBuilder* b);
+
+// Exchanges halo on all dimensions of the HLO. Returns nullopt if any one of
+// the dimensions fails to exchange halo (halo is beyond the neighbor shard).
+absl::optional<HloInstruction*> ExchangeHalo(
+    HloInstruction* hlo,
+    std::vector<OffsetCalculation> left_halo_size_functions,
+    std::vector<OffsetCalculation> right_halo_size_functions,
+    const HloSharding& target,
+    const SPMDCollectiveOpsCreator& collective_ops_creator,
+    int64* next_channel_id, SpmdBuilder* b);
+
+// Exchanges halos and performs pad/dynamic-slice on the concatenated data such
+// that the result starts with the first needed element on each shard. It also
+// masks off invalid data due to padding.
+// Arguments:
+//  hlo: the HLO op before halo exchange
+//  explicit_left_padding_on_full_shape: the amount of left padding to be added
+//   explicitly by this function on the base shape before partitioning. Without
+//   base dilation, this is usually set to the window's padding_low so that the
+//   sharded op does not need to add padding_low on the window; however, with
+//   base dilation, this could only be set to a custom size.
+//  padded_full_shape_size: the size of the padded full shape on the given
+//   dimension, which includes explicit_left_padding_on_full_shape and required
+//   right padding to make the shape evenly shardable.
+//  shard_size_with_halo: the shard size on the dimension after halo exchange.
+//   If different shards have different sizes, use the maximum size.
+//  offset_on_padded_shape: the offset HLO (S32) that represents the start of
+//   each shard on the padded full shape.
+//  pad_value: the padding value used on the full shape.
+absl::optional<HloInstruction*> ExchangeHaloAndGetValidData(
+    HloInstruction* hlo, const Shape& base_shape,
+    const OffsetCalculation& left_halo_size_function,
+    const OffsetCalculation& right_halo_size_function,
+    int64 explicit_left_padding_on_full_shape, int64 padded_full_shape_size,
+    int64 shard_size_with_halo, int64 dim, const HloSharding& target,
+    HloInstruction* offset_on_padded_shape, HloInstruction* pad_value,
+    HloInstruction* partition_ordinal,
+    const SPMDCollectiveOpsCreator& collective_ops_creator,
+    int64* next_channel_id, SpmdBuilder* b, bool mask_invalid_region = true);
+
+}  // namespace spmd
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_SPMD_SPMD_PARTITIONER_UTIL_H_

From f5e922903c1f4ffcce836f3c484f2e222e0922b2 Mon Sep 17 00:00:00 2001
From: Edward Loper
Date: Wed, 13 May 2020 11:23:16 -0700
Subject: [PATCH 120/412] Roll back change: "For python op generation: add
 dispatch to all generated ops (don't skip ops with VISIBILITY=HIDDEN)"

PiperOrigin-RevId: 311368253
Change-Id: I137dff8c2153ac9666c4a028ca891b01dcb96cc6
---
 tensorflow/python/framework/python_op_gen.cc | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 02b659528b0..857cc7b6638 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -371,7 +371,9 @@ void GenEagerPythonOp::HandleGraphMode(
     const string& function_setup, const std::vector<string>& output_sizes) {
   strings::StrAppend(&result_, "  # Add nodes to the TensorFlow graph.\n");
   strings::StrAppend(&result_, function_setup);
-  strings::StrAppend(&result_, "  try:\n  ");
+  if (api_def_.visibility() == ApiDef::VISIBLE) {
+    strings::StrAppend(&result_, "  try:\n  ");
+  }
   strings::StrAppend(
       &result_, "    _, _, _op, _outputs = _op_def_library._apply_op_helper(\n");
   AddBodyNoReturn(strings::StrCat("        \"", op_def_.name(), "\", "));
@@ -688,7 +690,9 @@ void GenEagerPythonOp::AddEagerFunctionTeardown(
 
 bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
     const string& parameters, const std::vector<string>&
output_sizes, const string& eager_not_allowed_error) { - strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); + if (api_def_.visibility() == ApiDef::VISIBLE) { + strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n"); + } AddExport(); AddDefLine(function_name_, parameters); @@ -951,6 +955,8 @@ void GenEagerPythonOp::AddEagerExecute(const string& indentation, } void GenEagerPythonOp::AddDispatch(const string& prefix) { + if (api_def_.visibility() != ApiDef::VISIBLE) return; + strings::StrAppend(&result_, prefix, "except (TypeError, ValueError):\n"); strings::StrAppend(&result_, prefix, " result = _dispatch.dispatch(\n"); AddBodyNoReturn(strings::StrCat(prefix, " ", function_name_, ", ")); From 0ecd3d8db02b004795402d58130d7816e8e00965 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 11:41:34 -0700 Subject: [PATCH 121/412] Update tf.InplaceUpdate summary and description so both TensorFlow op registry and TensorFlow MLIR ODS match. PiperOrigin-RevId: 311372143 Change-Id: I0f6a8debc07cfe67662b3256d0f4ffdab1070eb5 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 25 ------------------- .../compiler/mlir/tensorflow/ir/tf_ops.td | 24 ++++++++++++++++++ .../base_api/api_def_InplaceUpdate.pbtxt | 8 +++--- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index bddf064f5c6..2d02d0b7508 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -3625,31 +3625,6 @@ tf.imag(input) ==> [4.75, 5.75] TF_DerivedResultTypeAttr Tout = TF_DerivedResultTypeAttr<0>; } -def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { - let summary = [{ - Create a copy of `x` with the updated specified rows 'i' with values 'v'. - - }]; - - let description = [{ - Creates a copy of tensor 'x' and updates the columns specified in tensor 'i' - with the values 'v'. Originally this function was mutative however for - compilation we make this operation create / operate on a copy. - }]; - - let arguments = (ins - TF_Tensor:$x, - I32Tensor:$i, - TF_Tensor:$v - ); - - let results = (outs - TF_Tensor:$y - ); - - TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; -} - def TF_InvOp : TF_Op<"Inv", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes the reciprocal of x element-wise."; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 744d1ac5b71..94b0c5f5e19 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -905,5 +905,29 @@ def TF_TensorSliceDatasetOp : TF_Op<"TensorSliceDataset", []> { TF_DerivedOperandTypeListAttr Toutput_types = TF_DerivedOperandTypeListAttr<0>; } +// TODO(b/156507832): Move tf.InplaceUpdate to tf_generated_ops.td once +// autogenerated op def matches. +def TF_InplaceUpdateOp : TF_Op<"InplaceUpdate", [NoSideEffect]> { + let summary = "Updates specified rows 'i' with values 'v'."; + + let description = [{ +Computes `x[i, :] = v; return x`. + +Originally this function is mutative however for compilation we make this +operation create / operate on a copy of `x`. 
+ }]; + + let arguments = (ins + TF_Tensor:$x, + I32Tensor:$i, + TF_Tensor:$v + ); + + let results = (outs + TF_Tensor:$y + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} #endif // TF_OPS diff --git a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt index 2fcd3659dc7..c0c160d1be4 100644 --- a/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_InplaceUpdate.pbtxt @@ -20,9 +20,11 @@ op { "A `Tensor` of type T. An alias of `x`. The content " "of `y` is undefined if there are duplicates in `i`." } - summary: < Date: Wed, 13 May 2020 11:46:15 -0700 Subject: [PATCH 122/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311373038 Change-Id: I495328e259c9c69c73cfa43cc284da61999ada47 --- tensorflow/go/op/wrappers.go | 51 +++++++++++++++++++----------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a90fc2e3e26..7a07a0e78d8 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -23940,9 +23940,12 @@ func Copy(scope *Scope, input tf.Output, optional ...CopyAttr) (output tf.Output return op.Output(0) } -// Updates specified rows with values in `v`. +// Updates specified rows 'i' with values 'v'. // -// Computes `x[i, :] = v; return x`. +// Computes `x[i, :] = v; return x`. +// +// Originally this function is mutative however for compilation we make this +// operation create / operate on a copy of `x`. // // Arguments: // x: A tensor of type `T`. @@ -25651,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25714,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25965,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26449,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45537,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47477,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47548,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48537,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 90f3a1eb381e644ac5d0f3fd126af25f856820a9 Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Wed, 13 May 2020 11:49:10 -0700 Subject: [PATCH 123/412] Rolling change forward again: "Add support for global operation dispatchers. 
(This is intended for use by TF-internal classes only.)" PiperOrigin-RevId: 311373578 Change-Id: Ib40cee66bbb1395c8997db3c1eb3f5914425a280 --- tensorflow/python/util/dispatch.py | 21 +++++++++ tensorflow/python/util/dispatch_test.py | 58 ++++++++++++++++++++++++- 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py index e94e3345348..3868da14b44 100644 --- a/tensorflow/python/util/dispatch.py +++ b/tensorflow/python/util/dispatch.py @@ -39,6 +39,10 @@ from tensorflow.python.util import tf_inspect DISPATCH_ATTR = "_tf_dispatchers" +# OpDispatchers which should be used for all operations. +_GLOBAL_DISPATCHERS = [] + + class OpDispatcher(object): """Abstract base class for TensorFlow operator dispatchers. @@ -82,6 +86,19 @@ class OpDispatcher(object): getattr(op, DISPATCH_ATTR).append(self) +class GlobalOpDispatcher(object): + """Abstract base class for TensorFlow global operator dispatchers.""" + + NOT_SUPPORTED = OpDispatcher.NOT_SUPPORTED + + def handle(self, op, args, kwargs): + """Handle the specified operation with the specified arguments.""" + + def register(self): + """Register this dispatcher as a handler for all ops.""" + _GLOBAL_DISPATCHERS.append(self) + + def dispatch(op, *args, **kwargs): """Returns the result from the first successful dispatcher for a given op. @@ -101,6 +118,10 @@ def dispatch(op, *args, **kwargs): result = dispatcher.handle(args, kwargs) if result is not OpDispatcher.NOT_SUPPORTED: return result + for dispatcher in _GLOBAL_DISPATCHERS: + result = dispatcher.handle(op, args, kwargs) + if result is not OpDispatcher.NOT_SUPPORTED: + return result return OpDispatcher.NOT_SUPPORTED diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py index 89999fcf843..bd35c391924 100644 --- a/tensorflow/python/util/dispatch_test.py +++ b/tensorflow/python/util/dispatch_test.py @@ -45,6 +45,47 @@ def test_op(x, y, z): return x + (2 * y) + (3 * z) +class TensorTracer(object): + """An object used to trace TensorFlow graphs. + + This is an example class that is used to test global op dispatchers. The + global op dispatcher for TensorTracers is defined below. + """ + + def __init__(self, name, args=None, kwargs=None): + self.name = name + self.args = args + self.kwargs = kwargs + + def __repr__(self): + if self.args is None and self.kwargs is None: + return self.name + else: + args = [str(x) for x in self.args] + args += sorted( + ["{}={}".format(name, x) for (name, x) in self.kwargs.items()]) + return "{}({})".format(self.name, ", ".join(args)) + + +class TensorTracerOpDispatcher(dispatch.GlobalOpDispatcher): + """Global op dispatcher for TensorTracer.""" + + def handle(self, op, args, kwargs): + # Dispatcher only applies if at least one arg is a TensorTracer. 
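+    # Return NOT_SUPPORTED when no TensorTracer is involved so that dispatch()
+    # can continue on to any other registered global dispatchers.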
+ if not (any(self.is_tensor_tracer_arg(x) for x in args) or + any(self.is_tensor_tracer_arg(x) for x in kwargs.values())): + return self.NOT_SUPPORTED + + return TensorTracer(op.__name__, args, kwargs) + + def is_tensor_tracer_arg(self, value): + if isinstance(value, TensorTracer): + return True + if isinstance(value, (list, tuple)): + if any(isinstance(x, TensorTracer) for x in value): + return True + + @test_util.run_all_in_graph_and_eager_modes class DispatchTest(test_util.TensorFlowTestCase): @@ -131,8 +172,21 @@ class DispatchTest(test_util.TensorFlowTestCase): r".*some_op \(from __main__\) is deprecated and will be " "removed in a future version.*") + def testGlobalDispatcher(self): + original_global_dispatchers = dispatch._GLOBAL_DISPATCHERS + try: + TensorTracerOpDispatcher().register() + + x = TensorTracer("x") + y = TensorTracer("y") + trace = math_ops.reduce_sum(math_ops.add(math_ops.abs(x), y), axis=3) + self.assertEqual( + str(trace), "reduce_sum(add(name=None, x=abs(x), y=y), axis=3)") + + finally: + # Clean up. + dispatch._GLOBAL_DISPATCHERS = original_global_dispatchers + if __name__ == "__main__": googletest.main() - - From 91914123c4239c13b15789e0147a6874e0abaf6c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 18:49:08 +0000 Subject: [PATCH 124/412] Simplify comment and use tensor_util.is_tensor and convert_to_tensor if both x and y are not tensor This is needed for `x / y` where both x and y are not tensors. As long as x is a tensor, `x / y` will work and will rely on _truediv_python3 to correctly figure out the right type for y. Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 2c141483eb1..2d8285f709c 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -72,7 +72,6 @@ from __future__ import print_function import numpy as np import six -import sys from six.moves import builtins from six.moves import xrange # pylint: disable=redefined-builtin @@ -83,6 +82,7 @@ from tensorflow.python.framework import graph_util from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_data_flow_ops @@ -439,27 +439,9 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - # tf.math.divide will compute python style division x / y. As python 2 - # and python 3 have very much different semantics on `/` (__div__ vs. - # __truediv__), it would be natural to just use `x / y` as the operator - # '/' has already been registered for tensors, see - # _OverrideBinaryOperatorHelper for more details. - # However, in case both x and y are not tensors, the registered '/' - # _OverrideBinaryOperatorHelper will not take effect. In this case, - # python's default '/' operator will take effect which result in the return - # value of `tf.math.divide` as a non-Tensor. - # For that reason we excplicitly calls _truediv_python3/_div_python2 - # in case both x and y are not tensors. - # Since _truediv_python3/_div_python2 operates on tensors and will convert - # to tensor if needed. 
This avoid the situation of the following if not - # explicitly calling _truediv_python3/_div_python2: - # >>> tf.divide(5, 2) - # 2.5 <= should be instead. - if not (isinstance(x, ops.Tensor) or isinstance(y, ops.Tensor)): - if sys.version_info.major < 3: - return _div_python2(x, y) - else: - return _truediv_python3(x, y) + # We do conversion here to make sure at least either x or y is a tensor. + if not (tensor_util.is_tensor(x) or tensor_util.is_tensor(y)): + x = ops.convert_to_tensor(x) return x / y From 39026d9e33040e9ff2a9d226543cfcac40e97010 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 13 May 2020 19:59:40 +0000 Subject: [PATCH 125/412] Remove the need to check if y is a tensor (always convert x if not) Signed-off-by: Yong Tang --- tensorflow/python/ops/math_ops.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 2d8285f709c..03c10b37c95 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -439,8 +439,8 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - # We do conversion here to make sure at least either x or y is a tensor. - if not (tensor_util.is_tensor(x) or tensor_util.is_tensor(y)): + # We do conversion here to make sure at least x is a tensor. + if not tensor_util.is_tensor(x): x = ops.convert_to_tensor(x) return x / y From c14af64d68f7a5ad852bbe2ff33d553a6e37b1a1 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 13 May 2020 13:07:54 -0700 Subject: [PATCH 126/412] Fix Windows build after cl/311081931. PiperOrigin-RevId: 311389429 Change-Id: I12d355802a21f3538df563410a3ccf3d10dbedee --- .../crosstool/windows/msvc_wrapper_for_nvcc.py.tpl | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl index c10fb826494..de6512e3088 100644 --- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl +++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl @@ -59,7 +59,7 @@ def GetOptionValue(argv, option): parser.add_argument(option, nargs='*', action='append') option = option.lstrip('-/').replace('-', '_') args, leftover = parser.parse_known_args(argv) - if args and vars(args).get(option): + if args and vars(args)[option]: return (sum(vars(args)[option], []), leftover) return ([], leftover) @@ -136,10 +136,12 @@ def InvokeNvcc(argv, log=False): m_options = ["-m64"] nvccopts = ['-D_FORCE_INLINES'] - for capability in GetOptionValue(argv, "--cuda-gpu-arch"): + compute_capabilities, argv = GetOptionValue(argv, "--cuda-gpu-arch") + for capability in compute_capabilities: + print(capability) capability = capability[len('sm_'):] - nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % ( - capability, capability, capability) + nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % ( + capability, capability, capability)] nvccopts += nvcc_compiler_options nvccopts += undefines nvccopts += defines From 20f064bf51db4cd2b0934e9656f8b497691e7901 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Wed, 13 May 2020 13:10:21 -0700 Subject: [PATCH 127/412] Improve image testing for TFLite Java tests PiperOrigin-RevId: 311389871 Change-Id: I3114c2af72e6029035f4ba16f002ec284b9c5917 --- tensorflow/lite/java/BUILD | 1 + .../lite/InterpreterMobileNetTest.java | 66 
+++++++++++++----- .../java/org/tensorflow/lite/TestUtils.java | 62 ++++++++++++++++ .../java/src/testdata/grace_hopper_224.jpg | Bin 0 -> 24459 bytes 4 files changed, 111 insertions(+), 18 deletions(-) create mode 100644 tensorflow/lite/java/src/testdata/grace_hopper_224.jpg diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 2fcb4b631be..46cd1be25cb 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -14,6 +14,7 @@ package( exports_files([ "src/testdata/add.bin", "src/testdata/add_unknown_dimensions.bin", + "src/testdata/grace_hopper_224.jpg", ]) JAVA_SRCS = glob([ diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java index aaac2f9690a..446cf5f7b02 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterMobileNetTest.java @@ -18,7 +18,11 @@ package org.tensorflow.lite; import static com.google.common.truth.Truth.assertThat; import java.nio.ByteBuffer; -import java.nio.ByteOrder; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.Map; +import java.util.PriorityQueue; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; @@ -61,14 +65,9 @@ public final class InterpreterMobileNetTest { } private static void runMobileNetFloatTest(Interpreter.Options options) { - // Create a gray image. - ByteBuffer img = ByteBuffer.allocateDirect(1 * 224 * 224 * 3 * 4); - img.order(ByteOrder.nativeOrder()); - img.rewind(); - while (img.hasRemaining()) { - img.putFloat(0.5f); - } - + ByteBuffer img = + TestUtils.getTestImageAsFloatByteBuffer( + "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg"); float[][] labels = new float[1][1001]; try (Interpreter interpreter = new Interpreter(MOBILENET_FLOAT_MODEL_BUFFER, options)) { interpreter.run(img, labels); @@ -78,22 +77,53 @@ public final class InterpreterMobileNetTest { assertThat(labels[0]) .usingExactEquality() .containsNoneOf(new float[] {Float.NaN, Float.NEGATIVE_INFINITY, Float.POSITIVE_INFINITY}); + // 653 == "military uniform" + assertThat(getTopKLabels(labels, 3)).contains(653); } private static void runMobileNetQuantizedTest(Interpreter.Options options) { - // Create a gray image. 
-    ByteBuffer img = ByteBuffer.allocateDirect(1 * 224 * 224 * 3);
-    img.order(ByteOrder.nativeOrder());
-    img.rewind();
-    while (img.hasRemaining()) {
-      img.put((byte) 128);
-    }
-
+    ByteBuffer img =
+        TestUtils.getTestImageAsByteBuffer(
+            "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg");
+
+    byte[][] labels = new byte[1][1001];
     try (Interpreter interpreter = new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options)) {
-      byte[][] labels = new byte[1][1001];
       interpreter.run(img, labels);
       assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3});
       assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001});
     }
+    // 653 == "military uniform"
+    assertThat(getTopKLabels(labels, 3)).contains(653);
+  }
+
+  private static ArrayList<Integer> getTopKLabels(byte[][] byteLabels, int k) {
+    float[][] labels = new float[1][1001];
+    for (int i = 0; i < byteLabels[0].length; ++i) {
+      labels[0][i] = (byteLabels[0][i] & 0xff) / 255.0f;
+    }
+    return getTopKLabels(labels, k);
+  }
+
+  private static ArrayList<Integer> getTopKLabels(float[][] labels, int k) {
+    PriorityQueue<Map.Entry<Integer, Float>> pq =
+        new PriorityQueue<>(
+            k,
+            new Comparator<Map.Entry<Integer, Float>>() {
+              @Override
+              public int compare(Map.Entry<Integer, Float> o1, Map.Entry<Integer, Float> o2) {
+                // Intentionally reversed to put high confidence at the head of the queue.
+                return o1.getValue().compareTo(o2.getValue()) * -1;
+              }
+            });
+
+    for (int i = 0; i < labels[0].length; ++i) {
+      pq.add(new AbstractMap.SimpleEntry<>(i, labels[0][i]));
+    }
+
+    final ArrayList<Integer> topKLabels = new ArrayList<>();
+    int topKLabelsSize = Math.min(pq.size(), k);
+    for (int i = 0; i < topKLabelsSize; ++i) {
+      topKLabels.add(pq.poll().getKey());
+    }
+    return topKLabels;
+  }
 }
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java
index 1471b4b506b..ae88cddcf57 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/TestUtils.java
@@ -15,17 +15,24 @@ limitations under the License.
 
 package org.tensorflow.lite;
 
+import java.awt.image.BufferedImage;
 import java.io.File;
 import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
 import java.nio.MappedByteBuffer;
 import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.StandardOpenOption;
 import java.util.EnumSet;
+import javax.imageio.ImageIO;
 
 /** Utility for interacting with test-specific data.
*/ public abstract class TestUtils { + private static final float DEFAULT_IMAGE_MEAN = 127.5f; + private static final float DEFAULT_IMAGE_STD = 127.5f; + public static MappedByteBuffer getTestFileAsBuffer(String path) { try (FileChannel fileChannel = (FileChannel) @@ -40,5 +47,60 @@ public abstract class TestUtils { return true; } + public static ByteBuffer getTestImageAsByteBuffer(String path) { + File imageFile = new File(path); + try { + BufferedImage image = ImageIO.read(imageFile); + return toByteBuffer(image); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + public static ByteBuffer getTestImageAsFloatByteBuffer(String path) { + File imageFile = new File(path); + try { + BufferedImage image = ImageIO.read(imageFile); + return toFloatByteBuffer(image); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private static ByteBuffer toByteBuffer(BufferedImage image) { + ByteBuffer imgData = + ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 3) + .order(ByteOrder.nativeOrder()); + for (int y = 0; y < image.getHeight(); y++) { + for (int x = 0; x < image.getWidth(); x++) { + int val = image.getRGB(x, y); + imgData.put((byte) ((val >> 16) & 0xFF)); + imgData.put((byte) ((val >> 8) & 0xFF)); + imgData.put((byte) (val & 0xFF)); + } + } + return imgData; + } + + private static ByteBuffer toFloatByteBuffer(BufferedImage image) { + return toFloatByteBuffer(image, DEFAULT_IMAGE_MEAN, DEFAULT_IMAGE_STD); + } + + private static ByteBuffer toFloatByteBuffer( + BufferedImage image, float imageMean, float imageStd) { + ByteBuffer imgData = + ByteBuffer.allocateDirect(image.getHeight() * image.getWidth() * 3 * 4) + .order(ByteOrder.nativeOrder()); + for (int y = 0; y < image.getHeight(); y++) { + for (int x = 0; x < image.getWidth(); x++) { + int pixelValue = image.getRGB(x, y); + imgData.putFloat((((pixelValue >> 16) & 0xFF) - imageMean) / imageStd); + imgData.putFloat((((pixelValue >> 8) & 0xFF) - imageMean) / imageStd); + imgData.putFloat(((pixelValue & 0xFF) - imageMean) / imageStd); + } + } + return imgData; + } + private TestUtils() {} } diff --git a/tensorflow/lite/java/src/testdata/grace_hopper_224.jpg b/tensorflow/lite/java/src/testdata/grace_hopper_224.jpg new file mode 100644 index 0000000000000000000000000000000000000000..15a2f2bd2a5e3d1b23a9af50251b3711d3b0a69a GIT binary patch literal 24459 zcmcG!byyrt&@Z~U28TefB{&2K?jGFT-EDCQ4uOQA!Civ8`{EuXKv;CK;10nZ&gOkT zIp2Nmea^jq+?ws)o~r7qYMGwt-^|Ox%WnXVytJG&01gfgUtvy&1~FZ_0awr z_)BpA#Dj>ACF$(qc{a=Frr&U+4C7m6e z-GH)Yj@BN|AS z4$FUY{_%ACAD%Sb%s}qeR&M{hCl^?+|KAeA9?Z)B53+y802Nur)6UZBZ&P8T`L`7S zc(H#I;NL!`c%`MT2z2(ca&!9+#lIu?uj~!7hNGRuzqNf}%zs<|R}6qRh9x9e0TUnt za08eDEC5yjAgn)~0WPq^3dZ}#+h5)PLH{{6Fy`O-f9?FA=%1G0;>t-%0<}OMW;Qlf zmj5{&U=#WuDZy(+Gap!o%Y6XKSb?nE%sgNNE@9^Wcf$Q`58OXmn1sq!-as8YkfpP? zJ5bZs)5#oUX6Fb1;MV=sfhAK|f{hd$5C8~-6>*aR!~nRL|LqEbhl90I7B*fd04#A! 
zXLBonWiU(;?qvxO2Ear`MMXo!L_@>G#X!fvCB(+W#3m%h$0x+cC&tD6hyLU5-;@7q z4)+QJ;}zB`94ssx0xT>ng1;FS!M{~-|KAvR=>g!P!Wv+T0QVLEj|+!@3-_`D>j31J z0XR`ult{1@;O`>vFChTn;1Q9KQBcv)F<=+@{}jRj;1T{)hy#E_ghPNwghxU_Mnyuv zi+3hQBYp*XU#mz%=JJ-B2 zIyTHQ*#>ww&G~g+sU13-kL*)w3#97o$4>4aU#%Id4L5hb z8LFK>d@5QbBC2cBX@N96H&f)4$e zF90czHDmCzsGc=>^3;7u`PhDKmw4Gk1HMy|^c?gBP_Gkz!2SYQ7nx{_JOc&r^J=bX zN2~WbRb_t{9xBg)gFdAQ`301(X+3|2&^TP9pb8gIGtgP$^1r93i>JI@p8pU}61(}l zzcBce6WD5@F~bO6x7(l9*@Hquow&`Ls#^k_J)_rJ_6+3b4%2l5RH`f~D|BsmiJE=W zQ#Pirk+<}$a+(%(gu9EJDjisgW}Tm2msZWZ04kmk=(-9(gxT(g$WDY?kZ`5Innha& zCMPfc+82QGakRr);F`u0Ws%NB|9y6H(sTRl@xcr@TByP#TeWjQbBx?k__odO3w_PC zf8^m6;}dPAmFIm^omM#eaX$$W?KiTw&Ydn}QWrxsme^yAcsnLh+2L(>x4i2Rqn!8= zKZVj&ROibSKVe?s2Z#2%DADO+!k7U&miOz+l-sN|hg~Wv2f`RamyVGQ^+x;6FMx&x zP;JX>af_Of=8{#rC`6}W>ID#bh2yKEf8R87uZ^N|->=ll|D3FQ<%t0`IMjt6KG6zG z<&Cd3&328ty#RtB;3kLH1H2?izp5FUc1MrQu7!4sQ5c(Kf*ks>`&ewu9Nui8T;geUZ&Pb)BOaD7BB@F>(>)4Z4IxR zX6B}4cDS2`$G-398rA>ls-cKC1-YBJTGX8*&b;pwRk}T{uTh>8S<1c_y`)H(JP6X} zybqGCsnEZF?oIwCEv3zO?4J8^mOImnN16P_Vq5c|dN`=W zolGPub`MKbX063DXrHi$eLN&@`nhyQ7-g2C8v>R_(RPB3?UGM%mVZWgSpr73!;`b} zyf2YX=zjC*FCC?vn!-AP#AN|*Ij?=!;kM6fn3>B)HSH^HY6vT)xf$&<#JiWP(L)Ej zT7EJEThIz?G8D29UdG%f`V8jv(zs*FhgSYe6EE(n_BX@7+_84_45G*|KQtY&_-Q8w zHa}T9p%8n%%N2_$u6^PO*}wOby`F7Y+ULAfD7^2y;w%KWSbDyFOnk;_cxOu4b**(T z6Qtq0SNB}?r2NCq;(fWRZ7L4gipbMwV?*DwG1I{$g}$X;D)yJt%->HohXLF*&MM;# zgv|+#P6qm9D!Z*t_xcrjxkkQS#-e#(QVR@@!C23$RkJ$J)^xgM{c4NFL!NQp@+bO> zu8R2po7r#W#ILR#`v6?ZT_=7Iv9pqF`A@0`T&+47elAk0&9SnW?imN0m_=@0wFly5_;dxW;QLD`wtP_WM%4_C!P|((T0oRyAbZ zvz0myi>45FNy9#hm4Kga`v_d?j&5!p`cA(5{^PPEGQP7Y#W}dw1Vc^M3nnkWc70Fh z9arn7&;DFERIoSA44YV0F4;hlVgpjttobV-R;|w|EV)0f2q$ZTbX|P=&-Qzpj)*6t zvrMO$0=hCC9*n$OR^s=#yUm^~E^|^ZM3o1B=+Do0RlL!&Aa5Sxlg#+-)mCkH&+3qo zg#)-{W*>$FN@u-&p3YYQ&Ha8Jawv0B{^=Q?w6W`3Y0BekoUQysvDBGs+uWUZPr37? zQw2&?BH}DuG-a)i_KOt+WY4uTUe34QcNJevY;LY4#*MOI+Bc(SEe&Hs2ayK$HKlHaz30Sdc3Y3Y%vl%X!xJvwq^5aY=H}sV zifEU|WhXxhW`m|2cp#@Q0Qp9vC@tp)VJAP=>lj(iw%iE=y^o+5z_j0NbWTkIhO5ZL z@Gn`XHm%3^c>FK&27O`)?zBWkc*6W)XD{dA(qMZ$& zm+nK5T{@^AVqYtH5Hz|ZdjY6CzcQVj;`G*2mTKka?u><-{iPS+t>WRnEFCbHsWF~Wl-5_4;XWI>zPzNsr zi0xgt!`1Dce)ka9CSzWWBU4UYb6$dvU&LA5L7so-cUIBt)fV&aL8U&vtt)l!Y3T)U zXunV`QRnoo=JHd4#xkihk!TbvV={jiV=`gMP?g6FP?v;CZ0nJe;Mb55YcEjF@_{S-1yl>xPL4px) z>+V)2&-Dulzmf_7L{C$P@W-f#hqmdfUjVN`-}RDpEu@u_U*CgIxWk=1sqbES%|8TY z9NZ`N^pDxbIhIvAWd30FlRiDF5!zp@-e0`hn&B#pFg@u%4hY&|JuNj1U$;F z@3TKP8MXedTjcHPc1h(=<2L8L@LmbD5^3uG&HMrgd$w0O+=U-2|2g#03b*;7$IocZ zYt3P)Q<8!o^+$i$4-@R5H_r_;if^rOS*h?pWY^p{-&f}ycpI-9xV@)zKxyP?m!ofQ z5g3I|=7;ZXtZ!~x_BhSI?mTupVqLndSb0uAc>xHk*BqW;Sqv)nd(L%Bx0r@dW}OF= zcbTaFB&TOC-=3ZxUDK-}#%HytvM{p9S#uKoBN!yQU-DIU0vJmQbl7OtGpAyFWlNdb zUr^2hKJxdr)H~SnHHH2Gq*8b|H#`nZ`F+1!epD`r=6IjoS#UQ*PO0~_Sy8QeSx1H=8EJKQIL$4=dpvZN{PW@oz;`g z{^_2HKI6e`!^~hi(0W$%gYGb9bhJ*8^YYp)vks@{+?J8G)XAFXsb@;Zjo-vg`{ORv zdk!35)d#)0?Hn|Ga@tbZG!?uUagTD?5?c;xQ{k<;GQ8%g%u0Qucgbi|U-#Yl!tq;C z%SAF?oR~#$jVuIIJ~F;Daxosi;OqT-I1>~#F31V44n)ExSV@K)Tk+)P57#gIM`k@? 
z@7+L74YI{}sZ#WH56Cu&)Q`!AQJk7H-<;+jR2rtvid3G&UwuR14Jy$9Ir(56?=1u{ z_jlJGkIev(DvA0{qF4 z`+yqfN6H4+^Icf`9a>kpXH8j9%lEq+1Af=5r3H8xXM zffH9T&s^Xo--7Qvb>DC7>i_iDd;u`E zpp6Q?u^VMiSwA`JYW7h4M0{uxxj76aZR*C>ZZR;qJJc6 z7*ogt1*jgd6_oKJO}Y$^I%Gg6e-VXrm2d@G-v$J(t!cAgAjXrtx~v@(0>Ybn?ZM{X zOwBX9f{aY1;C?_}NG^E|+qa`SpXKVqM@xhIE+ms~r|C)g5Cv=F2tK%1>{omt)Vts1-+pk!M&=`}sn zR0iKyYm7VLe_9TAXcjCy^JL9;Mx>6Gm%fH11qYATC(is*Oi8fcqBd_n1 zTZKRW!~%r2l~_ELiljR%&kyY+K#)I#z9G0*j)39>%kpqh&3MnT)*d>y&)YUINKFuG z(|^lf$(by;i{uy86&O;;I;=;CT6J>0Jixo*DK|WIvUsH67TtMw2PZXo2c*Vf03grm zv)#D4qMeOJYbks^!=g+n%%m1%z5lGE*VWm3)>N>JkTwr;&_DftWh}$Q>gpy11S|lW z-Ltl=_(^`8GEy)G#n8Vj{)_uiz0tOEbBDo|KnrIl7c&n#b2~>n4;Z`+q}(1^^VW zV38623kZUp!N49E5Cn&W20(uq0La1+9|U+LctivwWY`Y{)+l%cTtp-QG726QyErQT zTNvmPLO{)t_XmxTNFu(Mvu<78{L`G(!^-^Xp-j$o>OZ`rAGK5)#Z5 z%ohL^CgsQd>jx1Y6%h#$9^tS2AJ_>O>8}?xWPF$t7_9V`qxTm9wZ!@#6hiYe&U_kf zxBrR8>~?M+H?AYXwWFktQTz!qA+sv#+yD5;a6EaK zg>y-2z##^RpD}fm*OC98*lxRYN?b?M-nfOY!p-F{)X|;OIHu@PKeTzY=%$oh%#@(~ zOT$GS)%An7JU$vFH>IN_0XGx0J_ph(To=_s@RbQIyFc-m@9k%ciABy)RP#+23Tj15{_LsXgU`j`gNzLW!)WOx7cqL5pTlWfvJ zb?c+%tn-_?<&@cYm99tw`}#{6C1VxCK2=Sr)E&o?s%tZS*_u^zd*jpzb?tI_Ls1exqC@fx?$tw=~0+mqg!%LIhtlK4~$Tq7R!AY}K*hCr9 z#!Vos!SlMZo=f9_l=B#pRGnBxr*rX9tPoS62wI@Jr(+NLLDZeE&Y#EuawJw z`j$+hS%D+pAn`R)1&_vU5ERi0My$#*;tug0O8A#(-aaa|Y9@Q5-%D+x zDF`v{_%G>beEgXnTm86Q!lb9&Qjw#vZ-xwy-)3ti7&+sM8e3=@%dxm3KUW9E0}#zU za>Mq3pXpRLCkBCMukww_I+&&wIac+wR@?ckM{7g0OmvBg8KzW9^MF*0%#30Oa;IMl zr&(mf@;(d;$=zaK^u1C(BduH&U_S3k)4lW^(Mq?a(30Z3w$9el_(EmBy3wowxm1tr zQjcCMs8PfpGJtDpQedAf zB38qsYxI%)2bx6s_=pg&)lpNo4IC}|TbduS0e|quX=(7mez9qW@|t*rlb~e_Qk;7J zoPuJaeYT%(Sp<|%EFU&m@}_9qXFRf)G-zTmWt3fWnPY0<$gzm_S)wbrF z6p@$Y1Pme8nhQR7;tz39p|49Wgq+@IF&PvkdF=c25Xh)1QUrcndHW>-O-{itapnxN z9Ye_6rQVxe;6c3mErsVtTT4fnGfQY+67q%!-{-;kTg@2wd8RVkRpt5vo>nWeZ)wW` zL>1esIcwT>^G!XUvkMEaImyxm{+X`6M`2< z`_|XC&V-VamexUN6@=E<*0+|!9jd!fJg!L?J*$6=IHOkIzpu-ZN`5|>)wvYFsIGC! zw?F??%Q!Z9$I7z33Y^Jlc#O^KU~q^km@kfaA88p%PV-5cUz<7tOtLx$FWz}tLf*HB zE1uC)jc7<*`rUMD2GaUd0h_TTu4dqKVPE*vCoCRr9_S7O%Z)cfE+{fjp~SvOma2u! z5x%iV4KcVdK~T0=>rw}$;%8SLhSyek3#n`4r>55H%~k-v8dFZT>#rF4KR)SAHob&A z65~AH^%M5e*5w_Gu&1BaA5KyXvKNRT1yF&nT*M_a=@HTDn<#J=l51zaA?0$g;)cK>om^WxXQb)krkvnA@PbWH%k) zoU&ClLT%0)Pp9Y_nQGd?dYX33<3b2tH&Bu0$q76gDERgQcqbxoEx)ctIkq%3LXKuS zdOSxd##)YsE(aKQkHXG__ZNmej53S1Gp3-Cxe&ep*!;QtC2j-8!TeRJpBMf$f5Q4PzqDl)^&E zZ%b*)J#tLTXx;i^Bzl!w zrTAYW5zXI}1Cj_%$UNm$YzJdghG}j+%-P*&zTWakC+-pO2}OVLZdd<*^Y1S@)cp z^uFZEu_;y0s&A_PRN^bU$o!P5=kU?s>k1MU4Nhr1-`?@0vB#uBMWWy2-FY|ju0RH9 z064#fLsitZsGzi@C>&>0FSxeowg2ELt-Obm4gl`z$`xEpFY&nCuk_UZs99Gx!~v*- z-L4#ef?XXaV>m6!f8HG|LwNQ5;*=1$xmc6uMo;nyPt zDKTRTMe1mOF3aEMWP@vDK}B~JmjZAl%RbxaD89|1c0(GR`0lbcX+o1U9*{+rU$ZEh zS#Eb#!?mTd8ZBzwX;n1WDPJOtURqLW$6%83G5cF9Ha_J$4@xy5cH8It`i+D0C7z5M zxFwgyBCd%^a02qM=!V=~w3`a+M}sr=1kNIHuvPSKyuF>%BzSgiR(e~#+@9cTae@(@*F1YYJ51+CE{V(T?v{ z?D)M=?y~6m?tDZD30_?h4Kd`as~w#DrZXA5Yha_pASm-=^7l^5H%rXT9V3dP3*hZ2 z2$OZ5;{YFq{1qxn()ou^Xa|f#HFld|r_$GkmEZVBc_Qr@$X)P?)WN*Gt;0;RN%+f! 
zSsq0f(T5isuE8)1nW45ukeJh7qGEEo3{MY$DUJU}3}zwTx@HBS9g9J&VC`FEe2!%&=7 zRnDpLv*F>*VC1A6g0!?azUQ>}JR3kUs0z6aI=3;nIAuO%WfmIjV<&DdfT}9GL_|N- z{nfjvMs3jAuwBHF#CUqJe6D#}9U%%;KG$D7UHulNRcz4B#@&+}U3C2G;aT6oMi|sY zzBzsym#;sNydHtM#n!v#-eCDIkYa0N+R1X#mp{;k$I^Ec*1S0BpY`M?jUeB)Sg`!CA+V(`0*_0Du1$ z77OJBbLA@cg|;u`Sfp#WGTs~=%SSO*<% zsf5j_;bfg86VJY)WJdX`(IlS)`L_2e!T~=fpLwn_t}1SxWE+j=6&$*->oAb?rIg@W zXb777qbsJBr{$BIWozly_n#Ofv|gE(VQrz!UNO8_Co1wTu)On$jYy(0R<%MuiVvMTW0MGR@}7K&HJ zX_(WDfChH5?~*aA0}|2DHW#EZn1aP{&*JZoL$#We)9-?07ZDvK)j|fFOEHApa)Kb( z%PrOS$te!+xE-y*ho3k$cgh!~JOi|5JgVcw^ci!RZ5ah4rKxTe@@GX(!Tsyq~YE3N$u%=L#a!VJX)hJ`+e)UExhi zMnS;jYED;4BOLpTv_Kgtm$j=xX-t1HTcyqX98^Ly&x>M5+@MCADsKEIxk#z<9Totk zHk3}T!a!3F_cEt-R5F5AtmhQ*+xei^lzP)*nEBJF$)IRt67lTP75CHl^!y3Y35e&* z@`pE-DXwB<(_Ls>4% zhhn}XA4HOklyG|qn{k%_7rc@PCT88(u-qAQIlJF!`qH29B2&+FT2vlwj5$}N9mn)b zvQNC;wRjbmR-3OzH3V4~OX!PvX6P{M2VBndX&%4Frlp!v|SCV zi~2hCXL`+(sw(bP(rKwN%~6Zv^F;o>ayIN0l?k-cYh(K?YTHU{D8Fn~yQ#|IJiIgw z&FL*_oJSfD?r@q_@+jw=c^f$V8yhhyIkJeay8wcU%Ou$TS&z7ptfK0psjqx6;4C!5S{r#5 z%UpKveJzB6LJZM;2tW+P6HSWy*dKyKg@oG#yPe3Mh7zo<3#tr1-u84Ppd#YZ$BLSi z_C)f9dE#NZ1*W+e*B3SsjY5MSATyJ+8F4jKSHLjACIhX;%l9AIr#5+6#+ql~b0km{ zJ^Um(Sa_=`koP!1PWA06?ozbS`gS(MYOZRYO{(D#`I)a68d$(M8GJPAI4s~dqIa;gP(RGlCs!nR$Q!ZQKE%6Rx8t1rGr#?HTp%gn zwdB+f0h}2Kw@hIJPuYz*z_9T%2&r|@fNNh49$>$BAl>=TZUFeBeZd*NmFp;2-*0Jd zsp+kTs8xxayg_OFxU-;^5SrXg|C{;52BS$9BX=V#<5pqoroQqMIn$@~PqJu85VQnS zCuEyuwkgel>m3veXWU+$1m2jT`zj~<;`ATuf)*uwplPIlBJ>TLE?)6I5`8U^=Gzy5 zp&i2yLUTGaVF5E{rQPW3r^Gj0i~$4Wp=kic_=x#l{}KV~)cchBqFynTR?6xEoEU>j zM-kVrzr}K}=qZs3gGj9#;f3|&TjP9*5x%|vB%_B_j^0NmK<)$cNcMiyBG z7Z#RDNz<$wHUIqBQP%6wJflU_!|J1yj7Yzn(;qr=`(T=(&poTft8}Ygt*Oq7BZAO& zO+LZWmAWOf&ytiLaA2~MBsjep?Yg6xQ|`Kb*7{6vHjINP)}ue#4B^>zE6mH|PS8Cn zAP4^WR`#Zpq}o8X0h$stT>9uBE0k1Fc0crPPVWy|ar4yD>++?dl|OKvBj-zU*SLR< zBgUvZoCv7CKw&SYq_E|xc@c(LAF&qme!l3T3v~hZ|o8(IGEly{YDzJ zJs#mdWr22g#A{U7YRN&NN*v#v8U>RX*U)3eP=C9g5}R3nL-5t|&i5qX@=O7Lrx8@% z3OU~AKz;nPXA<$S!Y!w<_*tp8sV_#=uC8TmF0c1yb3JBTq}9$TfFZCz>&S6NC~ueR zX(R!q4N8EDu%Nj4(^J7H2XC9;OvbUHsr>W7WZ1DdXS5PF5}VKXVZQKz9f5`YPbZ|I zpDnA3%?Z0S1q#;6YnXBp#70?Qlz%*9<@xbmX6ASo_a+=eV4ur;QoERpp{%UZ<6U7` z^6MF(N-tnT!6`T`efOKcHzu1rU-<1es~@f~EGgmn{Oh~m(8ioj5%Rj<3ur(z!9dAq zrdRNJ!1xd%abWu9Hl-pCxG;_&C+u7Jlj|v)%RcT>7g2E*1?H$V&t)_kL5NIwAo5$6 zqp>ml4ATVVla9w&0NBlE33oL}bd1zWqwy18F!t5;{-=I8x{o4f5G;qIj-p8{VZ?Bu|tSc^bQgBIn+kYZPVNf;MFRfn{Pu=!l$!D zPa=pi35(7&iXIrY;ewRsC5Ws0Fz5#he?y6Eu$#D9($)Z5f(7zds;_jE^o$n}PPKKHHZ}y00>5j+`9=Zr3C;;sBPu8@HBlxzlO>`e$d+I{roT_IWnKo9_a8hJx-uJLq`Fmrpg2H zOCjJ#*AJGBHCHCpFWb=2XbPg8+UiuUE*v)jA84qFdzKZ;vl8rrW`W?ZShSL#2+-bR zo`u|6lc*8wRITBIypNwGKjqkMdiWrJ>3?FVQzu3zQ*eddm*~iu|LiAmejjT~WVeSu z(tcB+kcv@}qplFmQbLdluN^=|3&c^3pT?rv9qkA%jnkNUWM{U!Zryr=+2RDLd;uhu zAHI7VGWrIZA!D`e>?*HzK8Z|AgCmaU=lY57uGCr6uBEGabtMuV7tI}fqVHYWt;WO= zdI$cU6(XaflnV75Ye8>``qn!A5vx^2iaA+~OqMv!|BW+$1AoC7^Gf0&{VXDufLD&+ z2JU#uk`(yLNXfM0`JySW|5~7l$G{*uk&#&(FL?l7_}(1!$6V&@+IqQ&ri5`mtPp#+ zCtkSyh8PZl?~nKW1@LvP0~Fx_L%Z#8jztFTxbWrl$J;*hM|KXjIW*CNBWu5)oe$i@ zsr^nUj76#2hsw=V$y7AV%}#GO);pwWn4y&#g4i1wKKsdShR_<{&e`rbwzU+!({qZ| z1g+FQ}P`H@-^|eDq!RVMu&u|E1Ml>erzk$=wlnwlJ?f#z?9;Ii*aU z4^={J*$5pySVf#BbchU^n_mA#Tfu`HJXvva>(GA!c_V zjQ1}I*}ZzpTt1;(2ui;!1BVrsm}$=xA$XnaL4hxT<6E=rghuK4bXHG7lA-#~kuq!N zt47+tOV9)(H5)(s51sH0B1A~+;=OsZ!UZt-Nl?;T$Zo>#K6pj5)F2l0*_MOCZRF#4@c76`!&D=44&o99$yI5v1t4*MB(=6?u zY59M0%nwo|{_&ceFzmg+<$MFGD00z;a#qwxjx>#N}E&0?kVipd1 z?CmQ(#w*wK;arbx_D`|%4iDVmFBy5?p^pjRd^xgXMZ~KOp79OD3Em`={haO{zs~6~ zXAP;}D^E*`F~YWRq~|XA-dCEnTCOuM;dIy^byuvEe zr*%O&XucLZ9sO=dKyrO&p8o*DFh2d#`m-)%ORS(e$gvM82!43K~ z$F!6I`;#NvaK0uYhErVP42W} 
z{~^d2fqUL01^1Kn`eqjV+Kxw1Cd?6ECUu9g-Vt#UCMUnpr5?2He7|-$;Eyyqf;rYFv!8fJo_PfKBzw&-a#SG zpJ7DVv;HAgR7j5!f#3c83g^|qu7XXeR<@UYrjwG7?cKYMJ>>z=Nx?|4&3QksR0rGc zli4|_!pzIDF`uA4S<>>g`ddj<(dZXI=dDcJu=M2vjnxy!k46$|reBfvZ=5Rdq;Ir? z@S}w13T8I(A(1=0=%o?hg#P`SU$(xy%9|!+9dH!oLJ$)Sz1PZ@btSg;rhQ%KVN1bNFBQ*!y$79OKpoH0roMb)dy%lvB`YVt&n3gLqb zW)cD7bc{0?SA=ubi3%P&{dR?|Iwx928Xi69Os;*|IzuY(KAPAc1sd2Ks!d8;mfWyF zV>P-}m&Xv=s2IT*;XN_AtG62CiXk)S>^>(Rzfp9tX{rh%lZ!U)Vs1pOoXaRA2`UR2 zYoW0hvO~$(l>8x#k#csqQW#H8}+Wq&^}vB&N@OnwVdK z9(~P5h3ZD_j10|MGlh~EB@}L`@hIOBALJ-8wH0g1`Aej^DcV_fMz7|fY$t67&Q-2d zoNemGvja#p@$9`guI$mf3h4On(@9|A#V5`=*@n8bBcGfJBdMd$8bVW=cU3f_ib0Gh9ywto{i!$j zvp3Rq$I9$VYFjK$uMt0Op{fTT;tPXKJ+vz}NXjfY`0adVfNzU`FL?X?kjsvcZ*?U2 zBSv3#$QKQ7kX+!YkCutq=@1+2w0@MKL(6mJDIe+-(?ha|(}46rri~sgLwt#B{Nk)< z#YanpWjZP0d~_saX5{c{$SbZ)pO4I9|A;Qm?2=tQQ(@w8)n$o?l@$x2*t!4R?m74koLAvge< z3;OL)qULFyuz>e^?$gM>rUP#KLyTEaF)UdO(A$JfO_}W&H366xFvQ?Zas)f)kzf`Y zvmBm^_$0CC^p`S(0p(oF*S7mt7VMTqMm23BORC4{{Z^(&%6~pL^SOMY1&+({ZGjv* z=63|Etgeq$OAeWrjT9y*yjkoMWUe*@;iBrwjRiQZl}Ol_x2&Mmb5O-j`5n{ZN}Hdc z>Y^EVwGQ$h>gxQ|;OKGlv|WR0gN_|=W@qf^Q}wLwt^d;4`m9Egd@8Av3+-F2W-)fa zxg9Qmr9Wdod0K`!*lgkO=AK@}5mL$qC4{ z$x2^Nlz__2vAS~0GCP7^;ff&FXzM*V%qa`y8oYJkR@MFrh=DCdVvOSQ0VClD=5 ziGt*EmwIP>Dl+R@N}UwWBMDv5z%PfWK8RTEAm`}}t>fb-8Y}=`!nxEz%e$F{n#K^h zz|Tq@cygo(0)>wQ(Wt`?G$B<#yqOh#3B7%%ZD+u6XHen;WG$YqA0sN)CUIi0EBBg$ zlFG`f#{mH-;(v%W^uPR4QLdM_V%(n(jggk^TxwudXkK-SMPk=cPK_%KZ5%F1ZQgTnnjn|ZdDX84ShC}bP!+ic{X9WhWlC|<%V z&4h8pmq4Xn#yz6AsG^K^4SG z_$Z*4?aD0cr#zgB^k$znhD0C({FI*S^qMqs?s#N4;){f)7bpFnu-M_dh+DH|e@WZD z3r4c`&#g;-{zJgv+Pv|^AJnngf0QN}tqglzJ1~k~v1kc;%P7zlqUa+jHRltA?jNY= z9ZsOz52n;NFqD!YM`#|$j3^Nz2_%SzjH1E^O1mO*a-Teot^FeNvxr~J(dbGv<`SqL zjcT~jnbMaNcQ`uG2)@kzPJw;N=csk~C4%oTS+-THo=kbBbebPAH(J8ag}tE7hK0>) zQ?-gf%@dW8ZFWiDvK;$-)Q4{c%FPk_J|H6@lvPc&tTB#I6^e#-En#6Vh!!!JF3=pc z$Pi0v6ns~OOlh&RKCQ2;ho+>S}rwmrAhzinb3;-XQ0B*>jULv)RFp zi_1y9K4rLPoM(EOU(R_!VQ1uY)sNkk)z zo8CQsd=hapN1Jlgz$->Q$tS|}mPwe{DU;*OZxLa3-F6#b z2ek*M0mgTQ&XNJ>lXkae<7eRpiXSKhrFq^_g7sw3)LSXBZHMZs28y;~YQ;CZ-p?l2 z>&P2ll)#O6zvDzK?qSIIAzc{N9w7=6*T+Wp&3tIvJtv&|UdD#6<0Sv}QSwo`g(CPF zDDtsUb2g%V$6m3!mS*Ocptd26lRW9$58m`H&uKfepU(6wyQfSTvKf4rR8yyHT7|xq zZXU*d-d|a)xXy=?n+>E4>6Nw8THGZx0iEupIXdaH&&=?VGfDmS$~JEOkQ|uxdAl>E zPIoeWme^Ib7LY<3wmVf^T)*l_9h+=WY-VxK@rV>>Bo`@zWAZH-Y4It=W32PET0j%k zdh+A>*Bdz1MFx36@!F(qu=^~BNY2bGdr-BD&a_rX?#g8^tCnb6V_)+)6~|j`nI1Ch zgR+AYP3U!^%;>AKM&-c}tGfiB>7F3G}-8Mt-lC(IN{`CAd`z3ajlCmycUXHHgRhb-e3&bM8 zE4v+Vr7=%iYnR4DGOOLE7_eOKeX3OQk9cY$2kO&Kq#$$>5M^rx!dcODMEw_BJn++67^J|`-9Ms;|F$hP&KLLm?I1<)_aumF7c;5zf+ z1)%s^!z8g{ukYmeSJ(GMD#v_rgm@q-=Nxr0zq`T)wW?kAMUMBeG5}yuBv#Bv85S}; zj3rc6c0ZQITGj9{f@(LZe1i+d#|H$q`snPdo5e( zNSCmzTOw)ysrH1cfqH>!9DyK14vI+% z@F7;B(?+n0C1=~WJYX=6T-Ok>n9gf*_=>MlKscVo%d51$TBYUg6@|Jwd?yg(RW$1z zRxEc1w?q*1iqvr~!Pq~!WR^H%W8FHWO`vp11w>*3o4u>`+kUfLvs$1QBl&eC%_ccL zbSGP~*eowb;QtEG8!+UIKa$&E6$8QRj%3+$&ST0MlA|+aOT9%ETB==jE~Nheh?O{0 z1BpJBr=P+^a&d9Ad&Ndf&kWO8^NmG5TfeC(OuW6cb!r{kbw(Q}{{WV=--Rn3_oMYR zk+-R9LmX|{bKbZfHp+^Jb(xERzk-~10Rw|P$M?X7#S^&o6A^O5lF=qd!3Uam}k= z`8k5!HNTmZuAk}Fgk+r($qrKGi%V4(r`j#nU8)-kexS!acuGIq97kEjX4fqnuG7lo z*HJX5Ef*B|?76g`DQA^7v(O&t_a7`?jLmz?d1EV9=1kppq_tXT0d}IQsm+bF34jlB zNgxD(Njv~?R=h`e-O)?#nABtpfo-YH71vM_x9Vveqotx!Po=_K0ml{(DM~;d>K!qU zd=l4P#N6;DZ*UBFbb4?|{I_V~+01r@SZNA_1+oWT!>%+}8n#)j0oU7XgFdAM#u+u4 zj`v--J#cX;GD;g)mYMp0d}_>>3xzUIUx~w_!bsKbMX@jH6jH__rG$_;xZBWQ-ARkg z_9_GwIdN&ok3(dwN9aH2i5A=AUozy>S&l-=bSzg|?8~#Ns^sFI^HQY(R2y$-5dr~G z_~N#yBX27HXyc*A0#hO-Bje5_b_q!{40R384&d$Nvv`+Hu`EHNHrb6Fx8=FN(LOV# zv}$gkT;I9oEYnh_YMn2s6p2kUd&mQXwgKn>>Tt?%?{RBcQk0@psXn5wa(kQv5>ix< 
zL=HRS-kpT<48$MTI3*1{RO}JL(AJIsSS+;lGgP}g&MrNs>d=18n(BLJj2xR#`jQ9P zE~y*1nWJz(`1Qr44P7$-0Eo?J$T_lscC}qEOfCw8hw&{F1f=aS>b#;pp=1&9$6!29=#FO8{F83ZI(pzwwEbVEks;P(R_ob$kB*!vje7FY^`fA1AnXA1u9ZNM0EVo}o^JNcH zzK=@Kv?v0Vq=E=M;h&so%{|ezJn8c4l#l>PB&G?8`W#_t?JBMD5g}X0U%0@^nr_0&ojy%hia4u`ymKQ3I71qj`%r(Vb0I^sm8oM>mAZ8Og#P;6Q>`y z-Ep10>20#OD%c8y{5ima#Lne50QK0PBE)caljM)vQ4Y;gNM0J?Z^N zpOf=N!q>Z%w$rl1o&9YpP_L?fxUjh&#=X&kiXBACtk$3bHMXJ{cjBcoqp@G`sXr`L zic$vY1V=r4VPQaknF0iv@BX+cNr+Ta3r6iZZWVcHVW0j+{nte@KDFt7p((m-xteK> zYX1Q8RbbN8{E~(KKlnS#ewuK~g9ZZZpvd*$kG6(+0LVzDA!N__*EoqtJKFDOaEzRP0n&`ucl!Xvtc^FO!h*+8Hig9G| zm#MT1Z5O*mlJwU_eZuKoKP5Jzng0NX64p%dbEY|rw?&y6t;;n^+MU&aJxodM=QsF% zYf652Ee0z|Ik0t)ZivC*oq zKJ~NgR0i?$IIVW7lSh2ER+5-vvYo>nwz!k|W3Sr9knEbFkgKvi>Xo+bA>gx3cH=gs_Aom$EvEHTkkl&t6J1lqy0(3Ws*`1&nLez`WUej1WXMa5b2-R zJh=#M3Iqs)?fzH;i%2RwPgy4!Qsnwl4?nNh#|4mh<|nE4!?#eCgCGw;BiP^|&u$NY z*IWrnfrIsObF&VrFL-6Niy@r)G$0oWvRV;F!WNbB2( z7;^I1TC1xqmnuuGzNK`vlywbMQ$E>+H1ZS_g@f)CkVZa$JSpo!HP1?N?xQ%%Y}8(9 zcGk3xQq^l`WkLtTdPyJdsAK2J3n+q~-rcd(U)dF@hQ7g?w4$b{dA9nbUT9n0$V-4F z$oLYFet$eE7YbH8Mcul?UZUEwy#Vie}=QsFQgAchp*sK zh$*RethZ^*6u)Z@)LZThxaB&j9At+WdSPf&d?+kI1RjK9`s+TrX|90eI*k7Ssn_&d zt?Z92mQpv=*1l6Ah8mf1!yN|zNl{8cBb6D`_NVI&%Q;@}`3oat%(X>c<9AunBA zPdTm%GJnGah9KIy=CfNO>FlTn@>1rS;Ho7M0RscN`<3Ko>?+WLp zA07|?0B9zm_;$O}<&5E1S(-HM*CG3JuPNHAXi@b$M75cL)UczmTK&M8Y$+*^uuYf(~`zDksNB%b7+N%uH722oK zGwwfM-wB-bAyB1lBz52tFiKoj;$ZR5$K!^b65Ty=UaDu!JE(2!rDznET>_jc zqnd7vb*L%2Ypp73LvtmibX#QmrC~`wDaSM1S(CfD zDkIzbV%&Hu&LX=k-f|T~{{Vk!sTV3#DE|QE-h{dU?h@ib{{Zg%aiQq_Ys99V=R!2^ z9J~;GTAVsQv<*VN*EK84ZAc(cK#rq~1aK7`j)MW@iI@Pu=Ng5QD0{f%nVH5^t;T0@AFq}IA!-9>FeAPOnOC6y z05@5}gqA@vpi0S7r;d8yJedG-*OQMawIO9BZBUU1j(f<)LX@M=DkqPQ5h4PP*ezmccv%R1*p_9bn=ek6zUz0tp6BF@+4Cc9oGQ-;wph+AdeGmr$g? z5Pl+cBgB?`dp&9m({iCrTdl0ALXz1`NFkOIdUP9>eM=%#dH~AHnL{?`8uePTje_TT ztx7!A)6g)t))OFZ+K_mX2gIBeLvN|Bt76|pO7&GuLekAUEjCuSTvC8a5|POy@BqhQ zzh?gcg@tK0O@B6KyK`yGH&+9-2ZecH54&M19YbJ}V31a&b%1f_{3ArHXm#9nT*9I7 zm*Rq^n7*9^&i-1tNnN4D@%4j-_Ic5LRMRRdOP-Y;sW|7C?N!yht!vUY^xX(%qV=2W*m9Ym0W8x}I+q=&KGLnNr`)Yy z?3^)|aZUlG8)&&!_3G_aYoW1KT=Ec?weGpr_zZ7#bQastJM9uMlwe4rzd!tdr%*+5q4Y@cH0FV$Uj1BaC@X zsZs|4LS}GR*oN)Nr)6a*0*d_Erz z_{&ZZIXnR*bi!W7$#^3QqE<%e?l~U}60-!QGtUw2&OD{8sO|@B^Y1t)kU0aVKP)a_ zk$FTJDUbvR?fT;nZ5hvpPBkjWnjCs4+o=p!f&A32jgI`l1S{r&?S?9qB$=LnLm209 zQ9GplFxaLr$R%dr-FfW~X=&K5(_Wq`fSD=n^Ne###;wgY`txz53L7o9$_ihM`DMaC ztGMYk#1(|AaUh+dJD%9&{=H@v^H#HGOX?TBx32XKGJo{hK1zQk@sh$Dc}tG~wV2C; zU=8L|2XrP62lK`z0ZblA>S8b`ObL*B^WXh25IsYVXQX~u(JZ=~JUsYT&YmNS)cHdH z0A;s6io-_BcP}+4dBE40s9FI60Udgii6cIK&CPz(XK35L-tt!ahnx0fHFk6@)V346 zg!v@E3rwDppgZB`;5%FNgGVxx{{Y4rs+Qe#uDCq=PqR{%>nLD^PQ|GyS{!UbfKp5W z9Wi3{L&UzA>b+$%)*Q`lt9qH|2)S0*Q?RCK%76H3c0;OO#W)h;mhRK{XaPOPi(F2M zcC%nr3Uk2^h!=fyHhTraNgr{2W>5bBvt~6l{tWy}+SE(BOLBNm^#u44z4J8^xV;VJe|p%UX`zl?Q3$!egj` z1H8r*wZQD1)U>WN=N&%JI(u19L!NSl*2Miv6sA#KPFfD67*R^PC_9NE=i^ z2aYk<9aPA97ge&YMpn&sN~(%?A#5n92PvHd{o_lCO2B|4j-Z7D80A)a%Z%i#^L@$; z+`iu3n&9;kDndd$BGK@br;mp`;ObY1bA*6+0(*4l1faB$ zCxbkQkZ^NqN=&#&CL`KA96h656eU`(06CW=S+$W=Iik@-^~%1MEY*&qa+!G&U1<}- zO4}+RfxGU?XVn<(wpgyVyO-%|DAd{*OGBgp1Y_pv!NqDcznrCjRwA$*@{bCtgwJSjcFw#LPQTE^%#~JmfR`U+F>IHs#%r38U z`&go2l0_{VC(CGibt+NrrKtDB+2Z$Aa@{SVjV;TxS0CaVgQ>Y@dyTl!1>?{LY(7p=a}c) z9&ce(o{2CzU}{=;=s+ZZF(be`r~)h(HWVn3IOu<092k(N7M7VoQkI~EB_ov*U;*a)ts_Iry(n_Ct1Xs1|h z_J0}(E}z7%5~0BP;tb$BE%ildaC*wPg+m$e7Y2fOll;`|6;7?w4M<_yiW-cWbY=Qy zS5EU!AR$eWEAO8SYx^-Dj)0`^nvt4FqaJzK0 z)@C}DHq|&~y2W1p%vlnX3G$pP0!pL<(+R0EEse`1%IW-4uRe-`%~skFx^)K?RZ~jS z=?Msvu`fDA9yS85_?_`o#@D^TS%ibA;cvRs(_x(xo2qN)n>)ZiD{ZN_xGIxoJfoU& 
zlBkiH>{XVvT&e!sNScHM`RsX%Z_P(0Q@ypOebPoWg>mUAkI6ww5atnxQEx7~7 zO-BV?$w}nI4!r!ZR`kH*F7I)^Rg;Raaj#UELN-C2e_dC|aoC3@#ykWpjv_%HOc?;Iz0r~qHp!mz+x;<*DF=dt5(hq>n8Eb{2d7BD$8>=t z3HV_^DK3NM0CyoNN5&wGETp_%C{?l+nAAAK6m4cua z5}-&u{&CD7h8D?lCnY|l>NN?9EdKzQ+1bK&-Cr9|z;28;E9)z6RnOcmHI7u()h$mx z^Q+#s7Eh^4QRI?)oB?a)MItvI0PXO`7I$!{={>RIs_`U?6@WnagYVl4Jk@_M0G^Th z=LvuUI0gsx;{x5Z?MZ-o@r4C79WEYHf)BjRj61U4e!AIhO0w2Cx9v>)rwuA++klnz;xUKRnO4AdpL5?1dqbOJu}_LeMC{E+RaHM-RPA+AiFx-6 zjw(+i^WUU)_zYpRtz;P7dIRixV796fkOG8l>(q=Mn&bo!f!pX#6mYVLo13LOys-r% zo=yy)evvgvU3ELovXdpY)(}bdlgJoGov#T8yzqFzjW~d*CL^a4k2n|Zq>4_d!N6YJ z^&U#hXy=NYqop}*c6PL(`sY^5jd|&=VX!wXN{|a=^iUmpb;D0H>+2bVGFD~^e4wgV zDmJOK{i(80Qrb{GK?D7ec*HK1&s81|6aBDYS~>L|NRB&TYcNVCO!|P+nfY#Rb;>L_ z>8CCDtwkqIahfMN^zsJW5%RZUZa@~(!CLIq0oPF|ybE#8gPIIH@Ia!V#Q zp|p*_vUyhF>gn#EKKR|K&iBnezSZg1+0ICQs;9|n^gyDzh`-%un>*WWb`~BxsqZ=Xu&&Lw%hl`n9i|a? zUA<{-cQra%G^{+=3i8D6^UAsih~{Su!>*NKVSmFdKu?#q?TY#T0JGB(ii$uRw{yqH zU?ZB0Yb)8KB0G|Q zhS5rgZj4-2QYwno_f0(5M%|!6^NE@MP$Qg1?e69b`hEJ&4XgqZt;Csw$4)RnbSHwc z0Qba@!pe4t;}!t{#c^7!W~tScugxQ^b6_uFVYPR(r zVF?kKq*k}F$x(a zK~ae>Q0fadV8VSx_mUv{30F_1BT}MfQzPCm0|;(N3FcQu{-sn2ae%@4SUpKT)H0VjZ*t0tl99|3 zc*YWwBy{-+!dRq5m@&{sHnsU@Gg>Y5*L$v7yHQ)~Q@Wb6riIp-e0!nABqc+h#N%(J zw+l6=Q7ktb{S_@$B|S%(3B}ocU1tf(M6o6Ce0WnNkY(7B9;?6B{Z*DfzP57f&ZV)jD z^XIm<^u5nw<~pTo>(ucl2Qy6M2pdTNd!D%3W=mzJ*_kf1Hfq6DZl{0^Aw2IgOm**q z+)~t_QnR#3By_}^?3k1!0yY3627w4CbjQQvgy<@cM+SL4@IN)ggDV_}5%b5h5u$Li zQ@ut`^+BFEm>2__0zoST?dkEtV*8+S=#j*M*8p3tfFZzVuNYF`mf#~i#f1P#;$j91 zQ{g-m1rKm13DoPUOm_r>9dJsXQj~j;PjA%Y*@qOHq0~56w1q$>IqQNzQcNlV`1Zy< zQVA+BdJGLJT36>GMFk`o_V&X#>Y7$KNrgy})Ajn}0WwcMrF_3E7~pbz!kIJFnZQXP NZ6lFB*mCuk|JnJR1G)eJ literal 0 HcmV?d00001 From 3dcace488c5dbe4ff8a88496c5e8ff72144d780d Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 13 May 2020 13:47:15 -0700 Subject: [PATCH 128/412] review comments. --- .bazelrc | 2 +- tensorflow/tensorflow.bzl | 8 +- tensorflow/workspace.bzl | 11 -- third_party/mkl/build_defs.bzl | 2 +- third_party/mkl_dnn/build_defs.bzl | 4 +- third_party/mkl_dnn/mkldnn_threadpool.BUILD | 133 -------------------- third_party/mkl_dnn/mkldnn_v1.BUILD | 26 ++-- 7 files changed, 27 insertions(+), 159 deletions(-) delete mode 100644 third_party/mkl_dnn/mkldnn_threadpool.BUILD diff --git a/.bazelrc b/.bazelrc index bb5f1c03727..2c83830072d 100644 --- a/.bazelrc +++ b/.bazelrc @@ -143,11 +143,11 @@ build:mkl --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl --define=build_with_mkl_dnn_v1_only=true build:mkl -c opt +# config to build OneDNN backend with a user specified threadpool. build:mkl_threadpool --define=build_with_mkl=true --define=enable_mkl=true build:mkl_threadpool --define=tensorflow_mkldnn_contraction_kernel=0 build:mkl_threadpool --define=build_with_mkldnn_threadpool=true build:mkl_threadpool -c opt - # This config refers to building with CUDA available. It does not necessarily # mean that we build CUDA op kernels. 
build:using_cuda --define=using_cuda=true diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index b6066200553..0b544ae54f1 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -328,9 +328,11 @@ def tf_copts( if_mkl(["-DINTEL_MKL=1", "-DEIGEN_USE_VML"]) + if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + if_mkl_v1_open_source_only(["-DENABLE_MKLDNN_V1"]) + - if_mkldnn_threadpool(["-DENABLE_MKLDNN_THREADPOOL"]) + - if_mkldnn_threadpool(["-DENABLE_MKLDNN_V1"]) + - if_mkldnn_threadpool(["-DINTEL_MKL_DNN_ONLY"]) + + if_mkldnn_threadpool([ + "-DENABLE_MKLDNN_THREADPOOL", + "-DENABLE_MKLDNN_V1", + "-DINTEL_MKL_DNN_ONLY" + ]) + if_enable_mkl(["-DENABLE_MKL"]) + if_ngraph(["-DINTEL_NGRAPH=1"]) + if_android_arm(["-mfpu=neon"]) + diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 24f9b962d79..83e74f3d105 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -232,17 +232,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) - tf_http_archive( - name = "mkl_dnn_tp", - build_file = clean_dep("//third_party/mkl_dnn:mkldnn_threadpool.BUILD"), - sha256 = "54737bcb4dc1961d32ee75da3ecc529fa48198f8b2ca863a079e19a9c4adb70f", - strip_prefix = "oneDNN-1.4", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", - "https://github.com/oneapi-src/oneDNN/archive/v1.4.tar.gz", - ], - ) - tf_http_archive( name = "com_google_absl", build_file = clean_dep("//third_party:com_google_absl.BUILD"), diff --git a/third_party/mkl/build_defs.bzl b/third_party/mkl/build_defs.bzl index f69d27dd094..bd0686523bc 100644 --- a/third_party/mkl/build_defs.bzl +++ b/third_party/mkl/build_defs.bzl @@ -107,7 +107,7 @@ def mkl_deps(): return select({ "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_only": ["@mkl_dnn"], "@org_tensorflow//third_party/mkl_dnn:build_with_mkl_dnn_v1_only": ["@mkl_dnn_v1//:mkl_dnn"], - "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": ["@mkl_dnn_tp//:mkl_dnn"], + "@org_tensorflow//third_party/mkl_dnn:build_with_mkldnn_threadpool": ["@mkl_dnn_v1//:mkl_dnn"], "@org_tensorflow//third_party/mkl:build_with_mkl_ml_only": ["@org_tensorflow//third_party/mkl:intel_binary_blob"], "@org_tensorflow//third_party/mkl:build_with_mkl": [ "@org_tensorflow//third_party/mkl:intel_binary_blob", diff --git a/third_party/mkl_dnn/build_defs.bzl b/third_party/mkl_dnn/build_defs.bzl index 5778d136e9b..bd3b4b94f29 100644 --- a/third_party/mkl_dnn/build_defs.bzl +++ b/third_party/mkl_dnn/build_defs.bzl @@ -34,10 +34,10 @@ def if_mkldnn_threadpool(if_true, if_false = []): """Returns `if_true` if MKL-DNN v1.x is used. Shorthand for select()'ing on whether we're building with - MKL-DNN v1.x open source library only, without depending on MKL binary form. + MKL-DNN v1.x open source library only with user specified threadpool, without depending on MKL binary form. Returns a select statement which evaluates to if_true if we're building - with MKL-DNN v1.x open source library only. Otherwise, the + with MKL-DNN v1.x open source library only with user specified threadpool. Otherwise, the select statement evaluates to if_false. 
""" diff --git a/third_party/mkl_dnn/mkldnn_threadpool.BUILD b/third_party/mkl_dnn/mkldnn_threadpool.BUILD deleted file mode 100644 index 7209b8a62d0..00000000000 --- a/third_party/mkl_dnn/mkldnn_threadpool.BUILD +++ /dev/null @@ -1,133 +0,0 @@ -exports_files(["LICENSE"]) - -load( - "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", - "if_mkl_open_source_only", - "if_mkldnn_threadpool", -) -load( - "@org_tensorflow//third_party:common.bzl", - "template_rule", -) - -config_setting( - name = "clang_linux_x86_64", - values = { - "cpu": "k8", - "define": "using_clang=true", - }, -) - -template_rule( - name = "dnnl_config_h", - src = "include/dnnl_config.h.in", - out = "include/dnnl_config.h", - substitutions = { - "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL", - "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL", - "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", - }, -) -# Create the file mkldnn_version.h with MKL-DNN version numbers. -# Currently, the version numbers are hard coded here. If MKL-DNN is upgraded then -# the version numbers have to be updated manually. The version numbers can be -# obtained from the PROJECT_VERSION settings in CMakeLists.txt. The variable is -# set to "version_major.version_minor.version_patch". The git hash version can -# be set to NA. -# TODO(agramesh1) Automatically get the version numbers from CMakeLists.txt. - -template_rule( - name = "dnnl_version_h", - src = "include/dnnl_version.h.in", - out = "include/dnnl_version.h", - substitutions = { - "@DNNL_VERSION_MAJOR@": "1", - "@DNNL_VERSION_MINOR@": "4", - "@DNNL_VERSION_PATCH@": "0", - "@DNNL_VERSION_HASH@": "N/A", - }, -) - -cc_library( - name = "mkl_dnn", - srcs = glob([ - "src/common/*.cpp", - "src/common/*.hpp", - "src/cpu/*.cpp", - "src/cpu/*.hpp", - "src/cpu/**/*.cpp", - "src/cpu/**/*.hpp", - "src/cpu/xbyak/*.h", - ]) + if_mkldnn_threadpool([ - ":dnnl_config_h", - ]) + [":dnnl_version_h"], - hdrs = glob(["include/*"]), - copts = [ - "-fexceptions", - "-DUSE_MKL", - "-DUSE_CBLAS", - ] + if_mkl_open_source_only([ - "-UUSE_MKL", - "-UUSE_CBLAS", - ]) + if_mkldnn_threadpool([ - "-UUSE_MKL", - "-UUSE_CBLAS", - ]) + select({ - "@org_tensorflow//tensorflow:linux_x86_64": ["-fopenmp-simd"], - # TODO(ibiryukov): enable openmp with clang by including libomp as a - # dependency. - ":clang_linux_x86_64": [], - "//conditions:default": [], - }), - includes = [ - "include", - "src", - "src/common", - "src/cpu", - "src/cpu/gemm", - "src/cpu/xbyak", - ], - visibility = ["//visibility:public"], - deps = select({ - "@org_tensorflow//tensorflow:linux_x86_64": [ - "@mkl_linux//:mkl_headers", - "@mkl_linux//:mkl_libs_linux", - ], - "@org_tensorflow//tensorflow:macos": [ - "@mkl_darwin//:mkl_headers", - "@mkl_darwin//:mkl_libs_darwin", - ], - "@org_tensorflow//tensorflow:windows": [ - "@mkl_windows//:mkl_headers", - "@mkl_windows//:mkl_libs_windows", - ], - "//conditions:default": [], - }), -) - -cc_library( - name = "mkldnn_single_threaded", - srcs = glob([ - "src/common/*.cpp", - "src/common/*.hpp", - "src/cpu/*.cpp", - "src/cpu/*.hpp", - "src/cpu/**/*.cpp", - "src/cpu/**/*.hpp", - "src/cpu/xbyak/*.h", - ]) + [":dnnl_config_h"], - hdrs = glob(["include/*"]), - copts = [ - "-fexceptions", - "-DMKLDNN_THR=MKLDNN_THR_SEQ", # Disables threading. 
- ], - includes = [ - "include", - "src", - "src/common", - "src/cpu", - "src/cpu/gemm", - "src/cpu/xbyak", - ], - visibility = ["//visibility:public"], -) diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD index 243ec00a60f..c7aa0207ee2 100644 --- a/third_party/mkl_dnn/mkldnn_v1.BUILD +++ b/third_party/mkl_dnn/mkldnn_v1.BUILD @@ -4,6 +4,7 @@ load( "@org_tensorflow//third_party/mkl_dnn:build_defs.bzl", "if_mkl_open_source_only", "if_mkl_v1_open_source_only", + "if_mkldnn_threadpool", ) load( "@org_tensorflow//third_party:common.bzl", @@ -17,16 +18,26 @@ config_setting( "define": "using_clang=true", }, ) +_DNNL_RUNTIME_OMP = { + "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP", + "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP", + "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", +} + +_DNNL_RUNTIME_THREADPOOL = { + "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_THREADPOOL", + "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", +} template_rule( name = "dnnl_config_h", src = "include/dnnl_config.h.in", out = "include/dnnl_config.h", - substitutions = { - "#cmakedefine DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_${DNNL_CPU_THREADING_RUNTIME}": "#define DNNL_CPU_THREADING_RUNTIME DNNL_RUNTIME_OMP", - "#cmakedefine DNNL_CPU_RUNTIME DNNL_RUNTIME_${DNNL_CPU_RUNTIME}": "#define DNNL_CPU_RUNTIME DNNL_RUNTIME_OMP", - "#cmakedefine DNNL_GPU_RUNTIME DNNL_RUNTIME_${DNNL_GPU_RUNTIME}": "#define DNNL_GPU_RUNTIME DNNL_RUNTIME_NONE", - }, + substitutions = if_mkldnn_threadpool( + _DNNL_RUNTIME_THREADPOOL, + if_false = _DNNL_RUNTIME_OMP, + ), ) # Create the file mkldnn_version.h with MKL-DNN version numbers. @@ -59,9 +70,8 @@ cc_library( "src/cpu/**/*.cpp", "src/cpu/**/*.hpp", "src/cpu/xbyak/*.h", - ]) + if_mkl_v1_open_source_only([ - ":dnnl_config_h", - ]) + [":dnnl_version_h"], + ]) + [":dnnl_config_h"] + + [":dnnl_version_h"], hdrs = glob(["include/*"]), copts = [ "-fexceptions", From 7d704e32464c25e910a8ce51d643336b7d8f8bd6 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 13 May 2020 13:47:01 -0700 Subject: [PATCH 129/412] Move tf.keras.layers.featureDenseFeature back to Keras package. 
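The serialization registry change below is the mechanical core of this move: keras/layers/serialization.py no longer imports the DenseFeatures classes directly, and instead the feature_column modules register themselves through the new inject_feature_column_v1_objects / inject_feature_column_v2_objects hooks, with tf2.enabled() deciding which entry is used at deserialization time. A rough, self-contained sketch of that registry pattern (illustrative names only, not the actual TensorFlow modules or signatures):

    _V1_OBJECTS = {}
    _V2_OBJECTS = {}

    def inject_v1(name, cls):
        # Called at import time by the module that defines the V1 layer.
        _V1_OBJECTS[name] = cls

    def inject_v2(name, cls):
        # Called at import time by the module that defines the V2 layer.
        _V2_OBJECTS[name] = cls

    def resolve(name, tf2_enabled):
        # Deserialization looks the class up in the registry instead of
        # importing the defining module, so the table can be populated from
        # either package without a direct import here.
        return (_V2_OBJECTS if tf2_enabled else _V1_OBJECTS)[name]

    class DenseFeaturesV1(object):  # stand-in for the real layer class
        pass

    inject_v1('DenseFeatures', DenseFeaturesV1)
    assert resolve('DenseFeatures', tf2_enabled=False) is DenseFeaturesV1
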
PiperOrigin-RevId: 311396758 Change-Id: I253d89a5f23dce3ed06db665640c0c2ec3902cf9 --- tensorflow/python/feature_column/BUILD | 20 + .../feature_column/dense_features.py | 5 + .../feature_column/dense_features_test.py | 416 +----------------- .../feature_column/dense_features_v2.py | 7 +- .../feature_column/dense_features_v2_test.py | 2 +- .../feature_column/feature_column_lib.py | 8 +- .../feature_column/feature_column_v2_test.py | 291 ++++++++++++ .../feature_column/keras_integration_test.py | 2 +- .../sequence_feature_column_test.py | 49 +++ .../feature_column/serialization_test.py | 66 +++ tensorflow/python/keras/feature_column/BUILD | 78 ---- .../python/keras/feature_column/__init__.py | 0 ...equence_feature_column_integration_test.py | 2 +- .../python/keras/layers/serialization.py | 18 +- .../saving/saved_model/saved_model_test.py | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 2 +- ...sorflow.keras.layers.-dense-features.pbtxt | 4 +- 17 files changed, 463 insertions(+), 509 deletions(-) rename tensorflow/python/{keras => }/feature_column/dense_features.py (97%) rename tensorflow/python/{keras => }/feature_column/dense_features_test.py (62%) rename tensorflow/python/{keras => }/feature_column/dense_features_v2.py (94%) rename tensorflow/python/{keras => }/feature_column/dense_features_v2_test.py (99%) delete mode 100644 tensorflow/python/keras/feature_column/__init__.py diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD index 786c26c009a..d67cdf9cc06 100644 --- a/tensorflow/python/feature_column/BUILD +++ b/tensorflow/python/feature_column/BUILD @@ -55,6 +55,8 @@ py_library( py_library( name = "feature_column_v2", srcs = [ + "dense_features.py", + "dense_features_v2.py", "feature_column_v2.py", "sequence_feature_column.py", "serialization.py", @@ -124,6 +126,15 @@ tf_py_test( ], ) +tf_py_test( + name = "dense_features_test", + srcs = ["dense_features_test.py"], + tags = ["no_pip"], + deps = [ + ":feature_column_test_main_lib", + ], +) + py_library( name = "feature_column_test_main_lib", srcs = ["feature_column_test.py"], @@ -166,6 +177,15 @@ tf_py_test( deps = [":feature_column_v2_test_main_lib"], ) +tf_py_test( + name = "dense_features_v2_test", + srcs = ["dense_features_v2_test.py"], + tags = ["no_pip"], + deps = [ + ":feature_column_v2_test_main_lib", + ], +) + py_library( name = "feature_column_v2_test_main_lib", srcs = ["feature_column_v2_test.py"], diff --git a/tensorflow/python/keras/feature_column/dense_features.py b/tensorflow/python/feature_column/dense_features.py similarity index 97% rename from tensorflow/python/keras/feature_column/dense_features.py rename to tensorflow/python/feature_column/dense_features.py index 820f1a6b1b7..6feef185815 100644 --- a/tensorflow/python/keras/feature_column/dense_features.py +++ b/tensorflow/python/feature_column/dense_features.py @@ -23,6 +23,7 @@ import json from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops from tensorflow.python.keras import backend +from tensorflow.python.keras.layers import serialization as layer_serialization from tensorflow.python.util import serialization from tensorflow.python.util.tf_export import keras_export @@ -172,3 +173,7 @@ class DenseFeatures(fc._BaseFeaturesLayer): # pylint: disable=protected-access cols_to_output_tensors[column] = processed_tensors output_tensors.append(processed_tensors) return self._verify_and_concat_tensors(output_tensors) + + 
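+# Register the V1 DenseFeatures layer with the Keras serialization registry.
+# The inject_feature_column_v1_objects hook is added in
+# keras/layers/serialization.py later in this change, so deserialization can
+# resolve 'DenseFeatures' without importing this module directly.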
+layer_serialization.inject_feature_column_v1_objects( + 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/keras/feature_column/dense_features_test.py b/tensorflow/python/feature_column/dense_features_test.py similarity index 62% rename from tensorflow/python/keras/feature_column/dense_features_test.py rename to tensorflow/python/feature_column/dense_features_test.py index ec07964bcbe..7cd523dcc14 100644 --- a/tensorflow/python/keras/feature_column/dense_features_test.py +++ b/tensorflow/python/feature_column/dense_features_test.py @@ -18,21 +18,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column_v2 as fc -from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.keras.feature_column import dense_features as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import partitioned_variables @@ -678,417 +676,5 @@ class DenseFeaturesTest(test.TestCase): sess.run(net, feed_dict={features['price']: np.array(1)}) -class IndicatorColumnTest(test.TestCase): - - @test_util.run_deprecated_v1 - def test_dense_features(self): - animal = fc.indicator_column( - fc.categorical_column_with_identity('animal', num_buckets=4)) - with ops.Graph().as_default(): - features = { - 'animal': - sparse_tensor.SparseTensor( - indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) - } - net = df.DenseFeatures([animal])(features) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) - - -class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - { - 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True - }, { - 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False - }) - @test_util.run_deprecated_v1 - def test_dense_features(self, use_safe_embedding_lookup): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. 
- expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) - - # Provide sparse input and get dense result. - l = df.DenseFeatures((embedding_column,)) - dense_features = l({'aaa': sparse_input}) - - # Assert expected embedding variable and lookups. - global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - if use_safe_embedding_lookup: - self.assertIn('SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - else: - self.assertNotIn( - 'SparseFillEmptyRows', - [x.type for x in ops.get_default_graph().get_operations()]) - - @test_util.run_deprecated_v1 - def test_dense_features_not_trainable(self): - # Inputs. - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - # example 2, ids [] - # example 3, ids [1] - indices=((0, 0), (1, 0), (1, 4), (3, 0)), - values=(2, 0, 1, 1), - dense_shape=(4, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0, ids [2], embedding = [7, 11] - (7., 11.), - # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - (2., 3.5), - # example 2, ids [], embedding = [0, 0] - (0., 0.), - # example 3, ids [1], embedding = [3, 5] - (3., 5.), - ) - - # Build columns. - categorical_column = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - trainable=False) - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures((embedding_column,))({ - 'aaa': sparse_input - }) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - self.assertCountEqual([], - ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - -class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): - - def _test_dense_features(self, trainable=True): - # Inputs. - vocabulary_size = 3 - sparse_input_a = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 4)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_b = sparse_tensor.SparseTensorValue( - # example 0, ids [0] - # example 1, ids [] - indices=((0, 0),), - values=(0,), - dense_shape=(2, 5)) - sparse_input_c = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 1), (1, 1), (1, 3)), - values=(2, 0, 1), - dense_shape=(2, 5)) - sparse_input_d = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [] - indices=((0, 1),), - values=(2,), - dense_shape=(2, 5)) - - # Embedding variable. - embedding_dimension = 2 - embedding_values = ( - (1., 2.), # id 0 - (3., 5.), # id 1 - (7., 11.) # id 2 - ) - - def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) - self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) - return embedding_values - - # Expected lookup result, using combiner='mean'. - expected_lookups = ( - # example 0: - # A ids [2], embedding = [7, 11] - # B ids [0], embedding = [1, 2] - # C ids [2], embedding = [7, 11] - # D ids [2], embedding = [7, 11] - (7., 11., 1., 2., 7., 11., 7., 11.), - # example 1: - # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # B ids [], embedding = [0, 0] - # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] - # D ids [], embedding = [0, 0] - (2., 3.5, 0., 0., 2., 3.5, 0., 0.), - ) - - # Build columns. - categorical_column_a = fc.categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - categorical_column_b = fc.categorical_column_with_identity( - key='bbb', num_buckets=vocabulary_size) - categorical_column_c = fc.categorical_column_with_identity( - key='ccc', num_buckets=vocabulary_size) - categorical_column_d = fc.categorical_column_with_identity( - key='ddd', num_buckets=vocabulary_size) - - embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( - [categorical_column_c, categorical_column_d], - dimension=embedding_dimension, - initializer=_initializer, - trainable=trainable) - - features = { - 'aaa': sparse_input_a, - 'bbb': sparse_input_b, - 'ccc': sparse_input_c, - 'ddd': sparse_input_d - } - - # Provide sparse input and get dense result. - dense_features = df.DenseFeatures( - feature_columns=(embedding_column_b, embedding_column_a, - embedding_column_c, embedding_column_d))( - features) - - # Assert expected embedding variable and lookups. 
- global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertCountEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in global_vars])) - for v in global_vars: - self.assertIsInstance(v, variables_lib.Variable) - trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - if trainable: - self.assertCountEqual( - ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], - tuple([v.name for v in trainable_vars])) - else: - self.assertCountEqual([], tuple([v.name for v in trainable_vars])) - shared_embedding_vars = global_vars - - self.evaluate(variables_lib.global_variables_initializer()) - self.evaluate(lookup_ops.tables_initializer()) - - self.assertAllEqual(embedding_values, - self.evaluate(shared_embedding_vars[0])) - self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) - - @test_util.run_deprecated_v1 - def test_dense_features(self): - self._test_dense_features() - - @test_util.run_deprecated_v1 - def test_dense_features_no_trainable(self): - self._test_dense_features(trainable=False) - - -@test_util.run_all_in_graph_and_eager_modes -class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_get_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_identity( - key='b', num_buckets=3), dimension=2)] - orig_layer = df.DenseFeatures( - cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - self.assertEqual(config['name'], orig_layer.name) - self.assertEqual(config['trainable'], trainable) - self.assertLen(config['feature_columns'], 2) - self.assertEqual( - config['feature_columns'][0]['class_name'], 'NumericColumn') - self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) - self.assertEqual( - config['feature_columns'][1]['class_name'], 'EmbeddingColumn') - - @parameterized.named_parameters( - ('default', None, None), - ('trainable', True, 'trainable'), - ('not_trainable', False, 'frozen')) - def test_from_config(self, trainable, name): - cols = [fc.numeric_column('a'), - fc.embedding_column(fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']), dimension=2), - fc.indicator_column(fc.categorical_column_with_hash_bucket( - key='c', hash_bucket_size=3))] - orig_layer = df.DenseFeatures( - cols, trainable=trainable, name=name) - config = orig_layer.get_config() - - new_layer = df.DenseFeatures.from_config(config) - - self.assertEqual(new_layer.name, orig_layer.name) - self.assertEqual(new_layer.trainable, trainable) - self.assertLen(new_layer._feature_columns, 3) - self.assertEqual(new_layer._feature_columns[0].name, 'a') - self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) - self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') - self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) - - def test_crossed_column(self): - a = fc.categorical_column_with_vocabulary_list( - 'a', vocabulary_list=['1', '2', '3']) - b = fc.categorical_column_with_vocabulary_list( - 'b', vocabulary_list=['1', '2', '3']) - ab = fc.crossed_column([a, b], hash_bucket_size=2) - cols = [fc.indicator_column(ab)] - - orig_layer = df.DenseFeatures(cols) - config = orig_layer.get_config() - - new_layer = df.DenseFeatures.from_config(config) - - 
self.assertLen(new_layer._feature_columns, 1) - self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') - - -@test_util.run_all_in_graph_and_eager_modes -class SequenceFeatureColumnsTest(test.TestCase): - """Tests DenseFeatures with sequence feature columns.""" - - def test_embedding_column(self): - """Tests that error is raised for sequence embedding column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - embedding_column_a = fc.embedding_column( - categorical_column_a, dimension=2) - - input_layer = df.DenseFeatures([embedding_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In embedding_column: aaa_embedding\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - def test_indicator_column(self): - """Tests that error is raised for sequence indicator column.""" - vocabulary_size = 3 - sparse_input = sparse_tensor.SparseTensorValue( - # example 0, ids [2] - # example 1, ids [0, 1] - indices=((0, 0), (1, 0), (1, 1)), - values=(2, 0, 1), - dense_shape=(2, 2)) - - categorical_column_a = sfc.sequence_categorical_column_with_identity( - key='aaa', num_buckets=vocabulary_size) - indicator_column_a = fc.indicator_column(categorical_column_a) - - input_layer = df.DenseFeatures([indicator_column_a]) - with self.assertRaisesRegexp( - ValueError, - r'In indicator_column: aaa_indicator\. categorical_column must not be ' - r'of type SequenceCategoricalColumn\.'): - _ = input_layer({'aaa': sparse_input}) - - if __name__ == '__main__': test.main() diff --git a/tensorflow/python/keras/feature_column/dense_features_v2.py b/tensorflow/python/feature_column/dense_features_v2.py similarity index 94% rename from tensorflow/python/keras/feature_column/dense_features_v2.py rename to tensorflow/python/feature_column/dense_features_v2.py index e4dc22f1bbe..405c5d63249 100644 --- a/tensorflow/python/keras/feature_column/dense_features_v2.py +++ b/tensorflow/python/feature_column/dense_features_v2.py @@ -18,9 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import ops -from tensorflow.python.keras.feature_column import dense_features +from tensorflow.python.keras.layers import serialization as layer_serialization from tensorflow.python.util.tf_export import keras_export @@ -93,3 +94,7 @@ class DenseFeatures(dense_features.DenseFeatures): # We would like to call Layer.build and not _DenseFeaturesHelper.build. 
# pylint: disable=protected-access super(fc._BaseFeaturesLayer, self).build(None) # pylint: disable=bad-super-call + + +layer_serialization.inject_feature_column_v2_objects( + 'DenseFeatures', DenseFeatures) diff --git a/tensorflow/python/keras/feature_column/dense_features_v2_test.py b/tensorflow/python/feature_column/dense_features_v2_test.py similarity index 99% rename from tensorflow/python/keras/feature_column/dense_features_v2_test.py rename to tensorflow/python/feature_column/dense_features_v2_test.py index 95fc8b7ac1e..71cb163a7d9 100644 --- a/tensorflow/python/keras/feature_column/dense_features_v2_test.py +++ b/tensorflow/python/feature_column/dense_features_v2_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.feature_column import dense_features_v2 as df from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -30,7 +31,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.keras.feature_column import dense_features_v2 as df from tensorflow.python.ops import array_ops from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import variables as variables_lib diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py index bda20ff3f2c..afe14f55bfc 100644 --- a/tensorflow/python/feature_column/feature_column_lib.py +++ b/tensorflow/python/feature_column/feature_column_lib.py @@ -19,13 +19,13 @@ from __future__ import division from __future__ import print_function # pylint: disable=unused-import,line-too-long,wildcard-import,g-bad-import-order +# We import dense_features_v2 first so that the V1 DenseFeatures is the default +# if users directly import feature_column_lib. +from tensorflow.python.feature_column.dense_features_v2 import * +from tensorflow.python.feature_column.dense_features import * from tensorflow.python.feature_column.feature_column import * from tensorflow.python.feature_column.feature_column_v2 import * from tensorflow.python.feature_column.sequence_feature_column import * from tensorflow.python.feature_column.serialization import * -# We import dense_features_v2 first so that the V1 DenseFeatures is the default -# if users directly import feature_column_lib. 
-from tensorflow.python.keras.feature_column.dense_features_v2 import * -from tensorflow.python.keras.feature_column.dense_features import * from tensorflow.python.keras.feature_column.sequence_feature_column import * # pylint: enable=unused-import,line-too-long diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index a13f38a5203..fe769850fb0 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -31,6 +31,7 @@ from tensorflow.core.protobuf import rewriter_config_pb2 from tensorflow.python.client import session from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.feature_column import dense_features as df from tensorflow.python.feature_column import feature_column as fc_old from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization @@ -5581,6 +5582,23 @@ class IndicatorColumnTest(test.TestCase): self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]])) self.assertAllClose([[2. + 3.]], self.evaluate(predictions)) + @test_util.run_deprecated_v1 + def test_dense_features(self): + animal = fc.indicator_column( + fc.categorical_column_with_identity('animal', num_buckets=4)) + with ops.Graph().as_default(): + features = { + 'animal': + sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2]) + } + net = df.DenseFeatures([animal])(features) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) + @test_util.run_deprecated_v1 def test_input_layer(self): animal = fc.indicator_column( @@ -6253,6 +6271,156 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), self.evaluate(predictions)) + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False + }) + @test_util.run_deprecated_v1 + def test_dense_features(self, use_safe_embedding_lookup): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. 
+ categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + + # Provide sparse input and get dense result. + l = df.DenseFeatures((embedding_column,)) + dense_features = l({'aaa': sparse_input}) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + + @test_util.run_deprecated_v1 + def test_dense_features_not_trainable(self): + # Inputs. + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + # example 2, ids [] + # example 3, ids [1] + indices=((0, 0), (1, 0), (1, 4), (3, 0)), + values=(2, 0, 1, 1), + dense_shape=(4, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0, ids [2], embedding = [7, 11] + (7., 11.), + # example 1, ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + (2., 3.5), + # example 2, ids [], embedding = [0, 0] + (0., 0.), + # example 3, ids [1], embedding = [3, 5] + (3., 5.), + ) + + # Build columns. + categorical_column = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + trainable=False) + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures((embedding_column,))({ + 'aaa': sparse_input + }) + + # Assert expected embedding variable and lookups. + global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + self.assertItemsEqual([], + ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(global_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + @test_util.run_deprecated_v1 def test_input_layer(self): # Inputs. 
@@ -7158,6 +7326,129 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): # = [3*1 + 5*2, 3*0 +5*0] = [13, 0] self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions)) + def _test_dense_features(self, trainable=True): + # Inputs. + vocabulary_size = 3 + sparse_input_a = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 4)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_b = sparse_tensor.SparseTensorValue( + # example 0, ids [0] + # example 1, ids [] + indices=((0, 0),), + values=(0,), + dense_shape=(2, 5)) + sparse_input_c = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 1), (1, 1), (1, 3)), + values=(2, 0, 1), + dense_shape=(2, 5)) + sparse_input_d = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [] + indices=((0, 1),), + values=(2,), + dense_shape=(2, 5)) + + # Embedding variable. + embedding_dimension = 2 + embedding_values = ( + (1., 2.), # id 0 + (3., 5.), # id 1 + (7., 11.) # id 2 + ) + + def _initializer(shape, dtype, partition_info=None): + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertEqual(dtypes.float32, dtype) + self.assertIsNone(partition_info) + return embedding_values + + # Expected lookup result, using combiner='mean'. + expected_lookups = ( + # example 0: + # A ids [2], embedding = [7, 11] + # B ids [0], embedding = [1, 2] + # C ids [2], embedding = [7, 11] + # D ids [2], embedding = [7, 11] + (7., 11., 1., 2., 7., 11., 7., 11.), + # example 1: + # A ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # B ids [], embedding = [0, 0] + # C ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] + # D ids [], embedding = [0, 0] + (2., 3.5, 0., 0., 2., 3.5, 0., 0.), + ) + + # Build columns. + categorical_column_a = fc.categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + categorical_column_b = fc.categorical_column_with_identity( + key='bbb', num_buckets=vocabulary_size) + categorical_column_c = fc.categorical_column_with_identity( + key='ccc', num_buckets=vocabulary_size) + categorical_column_d = fc.categorical_column_with_identity( + key='ddd', num_buckets=vocabulary_size) + + embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + embedding_column_c, embedding_column_d = fc.shared_embedding_columns_v2( + [categorical_column_c, categorical_column_d], + dimension=embedding_dimension, + initializer=_initializer, + trainable=trainable) + + features = { + 'aaa': sparse_input_a, + 'bbb': sparse_input_b, + 'ccc': sparse_input_c, + 'ddd': sparse_input_d + } + + # Provide sparse input and get dense result. + dense_features = df.DenseFeatures( + feature_columns=(embedding_column_b, embedding_column_a, + embedding_column_c, embedding_column_d))( + features) + + # Assert expected embedding variable and lookups. 
+ global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertItemsEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) + trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) + if trainable: + self.assertItemsEqual( + ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], + tuple([v.name for v in trainable_vars])) + else: + self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) + shared_embedding_vars = global_vars + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, + self.evaluate(shared_embedding_vars[0])) + self.assertAllEqual(expected_lookups, self.evaluate(dense_features)) + + @test_util.run_deprecated_v1 + def test_dense_features(self): + self._test_dense_features() + + @test_util.run_deprecated_v1 + def test_dense_features_no_trainable(self): + self._test_dense_features(trainable=False) + @test_util.run_deprecated_v1 def test_serialization(self): diff --git a/tensorflow/python/feature_column/keras_integration_test.py b/tensorflow/python/feature_column/keras_integration_test.py index 456c0204350..e0677e84e50 100644 --- a/tensorflow/python/feature_column/keras_integration_test.py +++ b/tensorflow/python/feature_column/keras_integration_test.py @@ -23,12 +23,12 @@ import numpy as np from tensorflow.python import keras from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.feature_column import dense_features_v2 from tensorflow.python.feature_column import feature_column_lib as fc from tensorflow.python.feature_column import feature_column_v2 from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import testing_utils -from tensorflow.python.keras.feature_column import dense_features_v2 from tensorflow.python.keras.optimizer_v2 import gradient_descent from tensorflow.python.keras.premade import linear from tensorflow.python.keras.premade import wide_deep diff --git a/tensorflow/python/feature_column/sequence_feature_column_test.py b/tensorflow/python/feature_column/sequence_feature_column_test.py index d0cf5ee7670..3d5d24ec03a 100644 --- a/tensorflow/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/python/feature_column/sequence_feature_column_test.py @@ -24,6 +24,7 @@ from absl.testing import parameterized import numpy as np from tensorflow.python.client import session +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.feature_column import serialization @@ -110,6 +111,54 @@ class ConcatenateContextInputTest(test.TestCase, parameterized.TestCase): sfc.concatenate_context_input(context_input, seq_input) +@test_util.run_all_in_graph_and_eager_modes +class DenseFeaturesTest(test.TestCase): + """Tests DenseFeatures with sequence feature columns.""" + + def test_embedding_column(self): + """Tests that error is raised for sequence embedding column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + 
categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + embedding_column_a = fc.embedding_column( + categorical_column_a, dimension=2) + + input_layer = dense_features.DenseFeatures([embedding_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In embedding_column: aaa_embedding\. categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + def test_indicator_column(self): + """Tests that error is raised for sequence indicator column.""" + vocabulary_size = 3 + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, ids [2] + # example 1, ids [0, 1] + indices=((0, 0), (1, 0), (1, 1)), + values=(2, 0, 1), + dense_shape=(2, 2)) + + categorical_column_a = sfc.sequence_categorical_column_with_identity( + key='aaa', num_buckets=vocabulary_size) + indicator_column_a = fc.indicator_column(categorical_column_a) + + input_layer = dense_features.DenseFeatures([indicator_column_a]) + with self.assertRaisesRegexp( + ValueError, + r'In indicator_column: aaa_indicator\. categorical_column must not be ' + r'of type SequenceCategoricalColumn\.'): + _ = input_layer({'aaa': sparse_input}) + + def _assert_sparse_tensor_value(test_case, expected, actual): _assert_sparse_tensor_indices_shape(test_case, expected, actual) diff --git a/tensorflow/python/feature_column/serialization_test.py b/tensorflow/python/feature_column/serialization_test.py index 881ca0cca5e..78b72746ac9 100644 --- a/tensorflow/python/feature_column/serialization_test.py +++ b/tensorflow/python/feature_column/serialization_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from absl.testing import parameterized +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import serialization from tensorflow.python.framework import test_util @@ -113,6 +114,71 @@ class FeatureColumnSerializationTest(test.TestCase): self.assertIs(new_price.normalizer_fn, _custom_fn) +@test_util.run_all_in_graph_and_eager_modes +class DenseFeaturesSerializationTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_get_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_identity( + key='b', num_buckets=3), dimension=2)] + orig_layer = dense_features.DenseFeatures( + cols, trainable=trainable, name=name) + config = orig_layer.get_config() + + self.assertEqual(config['name'], orig_layer.name) + self.assertEqual(config['trainable'], trainable) + self.assertLen(config['feature_columns'], 2) + self.assertEqual( + config['feature_columns'][0]['class_name'], 'NumericColumn') + self.assertEqual(config['feature_columns'][0]['config']['shape'], (1,)) + self.assertEqual( + config['feature_columns'][1]['class_name'], 'EmbeddingColumn') + + @parameterized.named_parameters( + ('default', None, None), + ('trainable', True, 'trainable'), + ('not_trainable', False, 'frozen')) + def test_from_config(self, trainable, name): + cols = [fc.numeric_column('a'), + fc.embedding_column(fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']), dimension=2), + fc.indicator_column(fc.categorical_column_with_hash_bucket( + key='c', hash_bucket_size=3))] + orig_layer = dense_features.DenseFeatures( + cols, 
trainable=trainable, name=name) + config = orig_layer.get_config() + + new_layer = dense_features.DenseFeatures.from_config(config) + + self.assertEqual(new_layer.name, orig_layer.name) + self.assertEqual(new_layer.trainable, trainable) + self.assertLen(new_layer._feature_columns, 3) + self.assertEqual(new_layer._feature_columns[0].name, 'a') + self.assertEqual(new_layer._feature_columns[1].initializer.mean, 0.0) + self.assertEqual(new_layer._feature_columns[1].categorical_column.name, 'b') + self.assertIsInstance(new_layer._feature_columns[2], fc.IndicatorColumn) + + def test_crossed_column(self): + a = fc.categorical_column_with_vocabulary_list( + 'a', vocabulary_list=['1', '2', '3']) + b = fc.categorical_column_with_vocabulary_list( + 'b', vocabulary_list=['1', '2', '3']) + ab = fc.crossed_column([a, b], hash_bucket_size=2) + cols = [fc.indicator_column(ab)] + + orig_layer = dense_features.DenseFeatures(cols) + config = orig_layer.get_config() + + new_layer = dense_features.DenseFeatures.from_config(config) + + self.assertLen(new_layer._feature_columns, 1) + self.assertEqual(new_layer._feature_columns[0].name, 'a_X_b_indicator') + + @test_util.run_all_in_graph_and_eager_modes class LinearModelLayerSerializationTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/keras/feature_column/BUILD b/tensorflow/python/keras/feature_column/BUILD index 94097c28d73..650efcceb52 100644 --- a/tensorflow/python/keras/feature_column/BUILD +++ b/tensorflow/python/keras/feature_column/BUILD @@ -12,88 +12,11 @@ exports_files(["LICENSE"]) py_library( name = "feature_column", - srcs = ["__init__.py"], deps = [ - ":dense_features", - ":dense_features_v2", ":sequence_feature_column", ], ) -py_library( - name = "dense_features", - srcs = [ - "dense_features.py", - ], - deps = [ - "//tensorflow/python:framework_ops", - "//tensorflow/python:tf_export", - "//tensorflow/python:util", - "//tensorflow/python/feature_column:feature_column_v2", - "//tensorflow/python/keras:backend", - ], -) - -py_library( - name = "dense_features_v2", - srcs = [ - "dense_features_v2.py", - ], - deps = [ - ":dense_features", - "//tensorflow/python:framework_ops", - "//tensorflow/python:tf_export", - "//tensorflow/python/feature_column:feature_column_v2", - ], -) - -tf_py_test( - name = "dense_features_test", - srcs = ["dense_features_test.py"], - tags = ["no_pip"], - deps = [ - ":dense_features", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:errors", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:lookup_ops", - "//tensorflow/python:partitioned_variables", - "//tensorflow/python:session", - "//tensorflow/python:sparse_tensor", - "//tensorflow/python:variables", - "//tensorflow/python/eager:backprop", - "//tensorflow/python/eager:context", - "//tensorflow/python/feature_column:feature_column_v2", - ], -) - -tf_py_test( - name = "dense_features_v2_test", - srcs = ["dense_features_v2_test.py"], - tags = ["no_pip"], - deps = [ - ":dense_features_v2", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:errors", - "//tensorflow/python:framework_ops", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:lookup_ops", - "//tensorflow/python:session", - "//tensorflow/python:sparse_tensor", - 
"//tensorflow/python:variables", - "//tensorflow/python/eager:backprop", - "//tensorflow/python/eager:context", - "//tensorflow/python/feature_column:feature_column_v2", - ], -) - py_library( name = "sequence_feature_column", srcs = ["sequence_feature_column.py"], @@ -136,7 +59,6 @@ py_test( srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ - ":dense_features", ":sequence_feature_column", "//tensorflow/python:client_testlib", "//tensorflow/python:framework_test_lib", diff --git a/tensorflow/python/keras/feature_column/__init__.py b/tensorflow/python/keras/feature_column/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py index b1100bf7b07..8784182e23b 100644 --- a/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py +++ b/tensorflow/python/keras/feature_column/sequence_feature_column_integration_test.py @@ -24,11 +24,11 @@ from google.protobuf import text_format from tensorflow.core.example import example_pb2 from tensorflow.core.example import feature_pb2 from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.feature_column import dense_features from tensorflow.python.feature_column import feature_column_v2 as fc from tensorflow.python.feature_column import sequence_feature_column as sfc from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util -from tensorflow.python.keras.feature_column import dense_features from tensorflow.python.keras.feature_column import sequence_feature_column as ksfc from tensorflow.python.keras.layers import recurrent from tensorflow.python.ops import init_ops_v2 diff --git a/tensorflow/python/keras/layers/serialization.py b/tensorflow/python/keras/layers/serialization.py index 30be3d485df..0a90441d8a0 100644 --- a/tensorflow/python/keras/layers/serialization.py +++ b/tensorflow/python/keras/layers/serialization.py @@ -64,11 +64,23 @@ ALL_V2_MODULES = ( recurrent_v2, preprocessing_normalization ) +FEATURE_COLUMN_V1_OBJECTS = {} +FEATURE_COLUMN_V2_OBJECTS = {} # ALL_OBJECTS is meant to be a global mutable. Hence we need to make it # thread-local to avoid concurrent mutations. LOCAL = threading.local() +def inject_feature_column_v1_objects(name, cls): + global FEATURE_COLUMN_V1_OBJECTS + FEATURE_COLUMN_V1_OBJECTS[name] = cls + + +def inject_feature_column_v2_objects(name, cls): + global FEATURE_COLUMN_V2_OBJECTS + FEATURE_COLUMN_V2_OBJECTS[name] = cls + + def populate_deserializable_objects(): """Populates dict ALL_OBJECTS with every built-in layer. """ @@ -122,11 +134,9 @@ def populate_deserializable_objects(): LOCAL.ALL_OBJECTS['WideDeepModel'] = WideDeepModel if tf2.enabled(): - from tensorflow.python.keras.feature_column.dense_features_v2 import DenseFeatures # pylint: disable=g-import-not-at-top - LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures + LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V2_OBJECTS) else: - from tensorflow.python.keras.feature_column.dense_features import DenseFeatures # pylint: disable=g-import-not-at-top - LOCAL.ALL_OBJECTS['DenseFeatures'] = DenseFeatures + LOCAL.ALL_OBJECTS.update(FEATURE_COLUMN_V1_OBJECTS) # Merge layers, function versions. 
LOCAL.ALL_OBJECTS['add'] = merge.add diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 5e9ccc2d37a..9cbe8607a54 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -39,6 +39,7 @@ from tensorflow.python.distribute import mirrored_strategy from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.feature_column import feature_column_v2 as fc +from tensorflow.python.feature_column.dense_features import DenseFeatures from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -47,7 +48,6 @@ from tensorflow.python.keras import combinations from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import regularizers from tensorflow.python.keras import testing_utils -from tensorflow.python.keras.feature_column.dense_features import DenseFeatures from tensorflow.python.keras.saving.saved_model import load as keras_load from tensorflow.python.keras.saving.saved_model import save_impl as keras_save from tensorflow.python.keras.utils import generic_utils diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt index ba9156d7f95..ecda1603325 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt index 130a9954202..f7137f0d09b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt @@ -1,7 +1,7 @@ path: "tensorflow.keras.layers.DenseFeatures" tf_class { - is_instance: "" - is_instance: "" + is_instance: "" + is_instance: "" is_instance: "" is_instance: "" is_instance: "" From 8d1e8b350c37dc37b4439c7f646d3ec178598931 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 13:57:07 -0700 Subject: [PATCH 130/412] Export tf.keras.layers.experimental.preprocessing.CategoryCrossing layer. 
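A minimal usage sketch of the exported symbol (the '_X_' separator and the
string output dtype follow the layer docstring and tests changed below; the
printed values are illustrative):

    import tensorflow as tf

    inp_1 = tf.constant([['a'], ['b'], ['c']])
    inp_2 = tf.constant([['d'], ['e'], ['f']])
    layer = tf.keras.layers.experimental.preprocessing.CategoryCrossing()
    # Without hashing, each row of the inputs is crossed into a single
    # string feature joined with '_X_'.
    layer([inp_1, inp_2])  # -> [[b'a_X_d'], [b'b_X_e'], [b'c_X_f']]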
PiperOrigin-RevId: 311398537 Change-Id: I394c7dd5ae7fe168f3238dbd8a7ab064ff6ad2c1 --- tensorflow/python/keras/layers/__init__.py | 1 + .../python/keras/layers/preprocessing/BUILD | 19 ++ .../layers/preprocessing/benchmarks/BUILD | 10 + .../categorical_crossing_benchmark.py | 116 +++++++++ .../preprocessing/categorical_crossing.py | 140 ++++------- .../categorical_crossing_distribution_test.py | 64 +++++ .../categorical_crossing_test.py | 82 +------ ...tal.preprocessing.-category-crossing.pbtxt | 222 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + ...tal.preprocessing.-category-crossing.pbtxt | 222 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + 11 files changed, 707 insertions(+), 177 deletions(-) create mode 100644 tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py create mode 100644 tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py index 192c6a4afc8..ede199a9169 100644 --- a/tensorflow/python/keras/layers/__init__.py +++ b/tensorflow/python/keras/layers/__init__.py @@ -57,6 +57,7 @@ else: from tensorflow.python.keras.layers.preprocessing.text_vectorization_v1 import TextVectorization from tensorflow.python.keras.layers.preprocessing.text_vectorization import TextVectorization as TextVectorizationV2 TextVectorizationV1 = TextVectorization +from tensorflow.python.keras.layers.preprocessing.categorical_crossing import CategoryCrossing # Advanced activations. 
from tensorflow.python.keras.layers.advanced_activations import LeakyReLU diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 501c99fe890..bef294429bd 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -310,6 +310,25 @@ distribute_py_test( ], ) +distribute_py_test( + name = "categorical_crossing_distribution_test", + srcs = ["categorical_crossing_distribution_test.py"], + main = "categorical_crossing_distribution_test.py", + python_version = "PY3", + tags = [ + "multi_and_single_gpu", + ], + tpu_tags = [ + "no_oss", # b/155502591 + ], + deps = [ + ":categorical_crossing", + "//tensorflow/python/distribute:combinations", + "//tensorflow/python/distribute:strategy_combinations", + "//tensorflow/python/keras", + ], +) + tf_py_test( name = "discretization_test", size = "small", diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD index 276fb4767af..0c7e6ba856d 100644 --- a/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/BUILD @@ -17,6 +17,16 @@ tf_py_test( ], ) +tf_py_test( + name = "categorical_crossing_benchmark", + srcs = ["categorical_crossing_benchmark.py"], + python_version = "PY3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/python/keras/layers/preprocessing:categorical_crossing", + ], +) + tf_py_test( name = "index_lookup_adapt_benchmark", srcs = ["index_lookup_adapt_benchmark.py"], diff --git a/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py b/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py new file mode 100644 index 00000000000..80a7903f0b9 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/benchmarks/categorical_crossing_benchmark.py @@ -0,0 +1,116 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Benchmark for Keras categorical_encoding preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import time + +from absl import flags +import numpy as np + +from tensorflow.python import keras +from tensorflow.python.compat import v2_compat +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras.layers.preprocessing import categorical_crossing +from tensorflow.python.ops import sparse_ops +from tensorflow.python.platform import benchmark +from tensorflow.python.platform import test + +FLAGS = flags.FLAGS + +v2_compat.enable_v2_behavior() + + +# word_gen creates random sequences of ASCII letters (both lowercase and upper). +# The number of unique strings is ~2,700. +def int_gen(): + for _ in itertools.count(1): + yield (np.random.randint(0, 5, (1,)), np.random.randint(0, 7, (1,))) + + +class BenchmarkLayer(benchmark.Benchmark): + """Benchmark the layer forward pass.""" + + def run_dataset_implementation(self, batch_size): + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = dataset_ops.Dataset.from_generator( + int_gen, (dtypes.int64, dtypes.int64), + (tensor_shape.TensorShape([1]), tensor_shape.TensorShape([1]))) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + num_batches = 5 + ds = ds.take(num_batches) + ds = ds.prefetch(num_batches) + starts.append(time.time()) + # Benchmarked code begins here. + for i in ds: + _ = sparse_ops.sparse_cross([i[0], i[1]]) + # Benchmarked code ends here. + ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches + return avg_time + + def bm_layer_implementation(self, batch_size): + input_1 = keras.Input(shape=(1,), dtype=dtypes.int64, name="word") + input_2 = keras.Input(shape=(1,), dtype=dtypes.int64, name="int") + layer = categorical_crossing.CategoryCrossing() + _ = layer([input_1, input_2]) + + num_repeats = 5 + starts = [] + ends = [] + for _ in range(num_repeats): + ds = dataset_ops.Dataset.from_generator( + int_gen, (dtypes.int64, dtypes.int64), + (tensor_shape.TensorShape([1]), tensor_shape.TensorShape([1]))) + ds = ds.shuffle(batch_size * 100) + ds = ds.batch(batch_size) + num_batches = 5 + ds = ds.take(num_batches) + ds = ds.prefetch(num_batches) + starts.append(time.time()) + # Benchmarked code begins here. + for i in ds: + _ = layer([i[0], i[1]]) + # Benchmarked code ends here. 
+ ends.append(time.time()) + + avg_time = np.mean(np.array(ends) - np.array(starts)) / num_batches + name = "categorical_crossing|batch_%s" % batch_size + baseline = self.run_dataset_implementation(batch_size) + extras = { + "dataset implementation baseline": baseline, + "delta seconds": (baseline - avg_time), + "delta percent": ((baseline - avg_time) / baseline) * 100 + } + self.report_benchmark( + iters=num_repeats, wall_time=avg_time, extras=extras, name=name) + + def benchmark_vocab_size_by_batch(self): + for batch in [32, 64, 256]: + self.bm_layer_implementation(batch_size=batch) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py index e3eb27b2b4e..88b552e23b7 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py @@ -20,49 +20,35 @@ from __future__ import print_function import itertools +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import tensor_spec from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import sparse_ops +from tensorflow.python.ops.ragged import ragged_array_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.util.tf_export import keras_export +@keras_export('keras.layers.experimental.preprocessing.CategoryCrossing') class CategoryCrossing(Layer): """Category crossing layer. - This layer transforms multiple categorical inputs to categorical outputs - by Cartesian product, and hash the output if necessary. Without hashing - (`num_bins=None`) the output dtype is string, with hashing the output dtype - is int64. - - For each input, the hash function uses a specific fingerprint method, i.e., - [FarmHash64](https://github.com/google/farmhash) to compute the hashed output, - that provides a consistent hashed output across different platforms. - For multiple inputs, the final output is calculated by first computing the - fingerprint of `hash_key`, and concatenate it with the fingerprints of - each input. The user can also obfuscate the output with customized `hash_key`. - - If [SipHash64[(https://github.com/google/highwayhash) is desired instead, the - user can set `num_bins=None` to get string outputs, and use Hashing layer to - get hashed output with SipHash64. + This layer concatenates multiple categorical inputs into a single categorical + output (similar to Cartesian product). The output dtype is string. Usage: - - Use with string output. >>> inp_1 = tf.constant([['a'], ['b'], ['c']]) >>> inp_2 = tf.constant([['d'], ['e'], ['f']]) - >>> layer = categorical_crossing.CategoryCrossing() - >>> output = layer([inp_1, inp_2]) - - Use with hashed output. - >>> layer = categorical_crossing.CategoryCrossing(num_bins=2) - >>> output = layer([inp_1, inp_2]) - - Use with customized hashed output. - >>> layer = categorical_crossing.CategoryCrossing(num_bins=2, hash_key=133) - >>> output = layer([inp_1, inp_2]) + >>> layer = tf.keras.layers.experimental.preprocessing.CategoryCrossing() + >>> layer([inp_1, inp_2]) + Arguments: depth: depth of input crossing. 
By default None, all inputs are crossed into @@ -74,10 +60,6 @@ class CategoryCrossing(Layer): equal to N1 or N2. Passing `None` means a single crossed output with all inputs. For example, with inputs `a`, `b` and `c`, `depth=2` means the output will be [a;b;c;cross(a, b);cross(bc);cross(ca)]. - num_bins: Number of hash bins. By default None, no hashing is performed. - hash_key: Integer hash_key that will be used by the concatenate - fingerprints. If not given, will use a default key from - `tf.sparse.cross_hashed`. This is only valid when `num_bins` is not None. name: Name to give to the layer. **kwargs: Keyword arguments to construct a layer. @@ -87,114 +69,69 @@ class CategoryCrossing(Layer): Output shape: a single string or int tensor or sparse tensor of shape `[batch_size, d1, ..., dm]` - Below 'hash' stands for tf.fingerprint, and cat stands for 'FingerprintCat'. + Returns: + If any input is `RaggedTensor`, the output is `RaggedTensor`. + Else, if any input is `SparseTensor`, the output is `SparseTensor`. + Otherwise, the output is `Tensor`. Example: (`depth`=None) If the layer receives three inputs: `a=[[1], [4]]`, `b=[[2], [5]]`, `c=[[3], [6]]` - the output will be a string tensor if not hashed: + the output will be a string tensor: `[[b'1_X_2_X_3'], [b'4_X_5_X_6']]` - the output will be an int64 tensor if hashed: - `[[cat(hash(3), cat(hash(2), cat(hash(1), hash(hash_key))))], - [[cat(hash(6), cat(hash(5), cat(hash(4), hash(hash_key))))]` Example: (`depth` is an integer) With the same input above, and if `depth`=2, - the output will be a list of 6 string tensors if not hashed: + the output will be a list of 6 string tensors: `[[b'1'], [b'4']]` `[[b'2'], [b'5']]` `[[b'3'], [b'6']]` `[[b'1_X_2'], [b'4_X_5']]`, `[[b'2_X_3'], [b'5_X_6']]`, `[[b'3_X_1'], [b'6_X_4']]` - the output will be a list of 6 int64 tensors if hashed: - `[[hash(b'1')], [hash(b'4')]]` - `[[hash(b'2')], [hash(b'5')]]` - `[[hash(b'3')], [hash(b'6')]]` - `[[cat(hash(2), cat(hash(1), hash(hash_key)))], - [cat(hash(5), cat(hash(4), hash(hash_key)))]`, - `[[cat(hash(3), cat(hash(1), hash(hash_key)))], - [cat(hash(6), cat(hash(4), hash(hash_key)))]`, - `[[cat(hash(3), cat(hash(2), hash(hash_key)))], - [cat(hash(6), cat(hash(5), hash(hash_key)))]`, Example: (`depth` is a tuple/list of integers) With the same input above, and if `depth`=(2, 3) - the output will be a list of 4 string tensors if not hashed: + the output will be a list of 4 string tensors: `[[b'1_X_2'], [b'4_X_5']]`, `[[b'2_X_3'], [b'5_X_6']]`, `[[b'3_X_1'], [b'6_X_4']]`, `[[b'1_X_2_X_3'], [b'4_X_5_X_6']]` - the output will be a list of 4 int64 tensors if hashed: - `[ - [cat(hash(2), cat(hash(1), hash(hash_key)))], - [cat(hash(5), cat(hash(4), hash(hash_key)))] - ]`, - `[ - [cat(hash(3), cat(hash(1), hash(hash_key)))], - [cat(hash(6), cat(hash(4), hash(hash_key)))] - ]`, - `[ - [cat(hash(3), cat(hash(2), hash(hash_key)))], - [cat(hash(6), cat(hash(5), hash(hash_key)))] - ]`, - `[ - [cat(hash(3), cat(hash(2), cat(hash(1), hash(hash_key))))], - [cat(hash(6), cat(hash(5), cat(hash(4), hash(hash_key))))] - ]` """ def __init__(self, depth=None, - num_bins=None, - hash_key=None, name=None, **kwargs): # TODO(tanzheny): Consider making seperator configurable. 
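    # Note (sketch of current behavior, per the tests in this patch): crossed
    # outputs are joined with tf.sparse.cross's default '_X_' separator, e.g.
    # 'a' crossed with 'd' yields 'a_X_d'.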
- if num_bins is None and hash_key is not None: - raise ValueError('`hash_key` is only valid when `num_bins` is not None') super(CategoryCrossing, self).__init__(name=name, **kwargs) self.depth = depth - self.num_bins = num_bins - self.hash_key = hash_key if isinstance(depth, (tuple, list)): self._depth_tuple = depth elif depth is not None: self._depth_tuple = tuple([i for i in range(1, depth + 1)]) + strategy = ds_context.get_strategy() + if strategy.__class__.__name__.startswith('TPUStrategy'): + raise ValueError('TPU strategy is not support for this layer yet.') def partial_crossing(self, partial_inputs, ragged_out, sparse_out): """Gets the crossed output from a partial list/tuple of inputs.""" - if self.num_bins is not None: - partial_output = sparse_ops.sparse_cross_hashed( - partial_inputs, num_buckets=self.num_bins, hash_key=self.hash_key) - else: - partial_output = sparse_ops.sparse_cross(partial_inputs) - # If ragged_out=True, convert output from sparse to ragged. if ragged_out: - return ragged_tensor.RaggedTensor.from_sparse(partial_output) + return ragged_array_ops.cross(partial_inputs) elif sparse_out: - return partial_output + return sparse_ops.sparse_cross(partial_inputs) else: - return sparse_ops.sparse_tensor_to_dense(partial_output) + return sparse_ops.sparse_tensor_to_dense( + sparse_ops.sparse_cross(partial_inputs)) def call(self, inputs): depth_tuple = self._depth_tuple if self.depth else (len(inputs),) ragged_out = sparse_out = False - if all([ragged_tensor.is_ragged(inp) for inp in inputs]): - # (b/144500510) ragged.map_flat_values(sparse_cross_hashed, inputs) will - # cause kernel failure. Investigate and find a more efficient - # implementation - inputs = [inp.to_sparse() for inp in inputs] + if any([ragged_tensor.is_ragged(inp) for inp in inputs]): ragged_out = True - else: - if any([ragged_tensor.is_ragged(inp) for inp in inputs]): - raise ValueError( - 'Inputs must be either all `RaggedTensor`, or none of them should ' - 'be `RaggedTensor`, got {}'.format(inputs)) - - if any([isinstance(inp, sparse_tensor.SparseTensor) for inp in inputs]): - sparse_out = True + elif any([isinstance(inp, sparse_tensor.SparseTensor) for inp in inputs]): + sparse_out = True outputs = [] for depth in depth_tuple: @@ -229,15 +166,22 @@ class CategoryCrossing(Layer): def compute_output_signature(self, input_spec): input_shapes = [x.shape for x in input_spec] output_shape = self.compute_output_shape(input_shapes) - output_dtype = dtypes.int64 if self.num_bins else dtypes.string - return sparse_tensor.SparseTensorSpec( - shape=output_shape, dtype=output_dtype) + if any([ + isinstance(inp_spec, ragged_tensor.RaggedTensorSpec) + for inp_spec in input_spec + ]): + return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string) + elif any([ + isinstance(inp_spec, sparse_tensor.SparseTensorSpec) + for inp_spec in input_spec + ]): + return sparse_tensor.SparseTensorSpec( + shape=output_shape, dtype=dtypes.string) + return tensor_spec.TensorSpec(shape=output_shape, dtype=dtypes.string) def get_config(self): config = { 'depth': self.depth, - 'num_bins': self.num_bins, - 'hash_key': self.hash_key } base_config = super(CategoryCrossing, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py new file mode 100644 index 00000000000..e1ba91e3558 --- /dev/null +++ 
b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py @@ -0,0 +1,64 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for keras.layers.preprocessing.normalization.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.python import keras +from tensorflow.python.distribute import combinations +from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.framework import config +from tensorflow.python.framework import dtypes +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras.layers.preprocessing import categorical_crossing +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.platform import test + + +@combinations.generate( + combinations.combine( + # Investigate why crossing is not supported with TPU. + distribution=strategy_combinations.strategies_minus_tpu, + mode=['eager', 'graph'])) +class CategoryCrossingDistributionTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_distribution(self, distribution): + input_array_1 = np.array([['a', 'b'], ['c', 'd']]) + input_array_2 = np.array([['e', 'f'], ['g', 'h']]) + + # pyformat: disable + expected_output = [[b'a_X_e', b'a_X_f', b'b_X_e', b'b_X_f'], + [b'c_X_g', b'c_X_h', b'd_X_g', b'd_X_h']] + config.set_soft_device_placement(True) + + with distribution.scope(): + input_data_1 = keras.Input(shape=(2,), dtype=dtypes.string) + input_data_2 = keras.Input(shape=(2,), dtype=dtypes.string) + input_data = [input_data_1, input_data_2] + layer = categorical_crossing.CategoryCrossing() + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict([input_array_1, input_array_2]) + self.assertAllEqual(expected_output, output_dataset) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py index 49d8f0d7003..5bbcf5ce022 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_test.py @@ -40,7 +40,7 @@ from tensorflow.python.platform import test @keras_parameterized.run_all_keras_modes(always_skip_v1=True) class CategoryCrossingTest(keras_parameterized.TestCase): - def test_crossing_basic(self): + def test_crossing_sparse_inputs(self): layer = categorical_crossing.CategoryCrossing() inputs_0 = sparse_tensor.SparseTensor( indices=[[0, 0], [1, 0], [1, 1]], @@ -52,36 +52,6 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), 
output.indices) self.assertAllEqual([b'a_X_d', b'b_X_e', b'c_X_e'], output.values) - def test_crossing_sparse_inputs(self): - layer = categorical_crossing.CategoryCrossing(num_bins=1) - inputs_0 = sparse_tensor.SparseTensor( - indices=[[0, 0], [1, 0], [1, 1]], - values=['a', 'b', 'c'], - dense_shape=[2, 2]) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - self.assertAllClose([0, 0, 0], output.values) - - def test_crossing_sparse_inputs_with_hash_key(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2, hash_key=133) - inputs_0 = sparse_tensor.SparseTensor( - indices=[[0, 0], [1, 0], [1, 1]], - values=['a', 'b', 'c'], - dense_shape=[2, 2]) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - self.assertAllClose([1, 0, 1], output.values) - - layer_2 = categorical_crossing.CategoryCrossing(num_bins=2, hash_key=137) - output = layer_2([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - # Note the output is different with above. - self.assertAllClose([0, 1, 0], output.values) - def test_crossing_sparse_inputs_depth_int(self): layer = categorical_crossing.CategoryCrossing(depth=1) inputs_0 = sparse_tensor.SparseTensor( @@ -127,35 +97,15 @@ class CategoryCrossingTest(keras_parameterized.TestCase): [expected_outputs_0, expected_outputs_1, expected_outputs_2], axis=0) self.assertAllEqual(expected_out, output) - def test_crossing_hashed_two_bins(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2) - inputs_0 = sparse_tensor.SparseTensor( - indices=[[0, 0], [1, 0], [1, 1]], - values=['a', 'b', 'c'], - dense_shape=[2, 2]) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose(np.asarray([[0, 0], [1, 0], [1, 1]]), output.indices) - self.assertEqual(output.values.numpy().max(), 1) - self.assertEqual(output.values.numpy().min(), 0) - - def test_crossing_hashed_ragged_inputs(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2) + def test_crossing_ragged_inputs(self): inputs_0 = ragged_factory_ops.constant( [['omar', 'skywalker'], ['marlo']], dtype=dtypes.string) inputs_1 = ragged_factory_ops.constant( [['a'], ['b']], dtype=dtypes.string) - out_data = layer([inputs_0, inputs_1]) - expected_output = [[0, 0], [0]] - self.assertAllClose(expected_output, out_data) inp_0_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string) inp_1_t = input_layer.Input(shape=(None,), ragged=True, dtype=dtypes.string) - out_t = layer([inp_0_t, inp_1_t]) - model = training.Model(inputs=[inp_0_t, inp_1_t], outputs=out_t) - self.assertAllClose(expected_output, model.predict([inputs_0, inputs_1])) non_hashed_layer = categorical_crossing.CategoryCrossing() out_t = non_hashed_layer([inp_0_t, inp_1_t]) @@ -198,16 +148,6 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertIsInstance(output, ragged_tensor.RaggedTensor) self.assertAllEqual(expected_output, output) - def test_invalid_mixed_sparse_and_ragged_input(self): - with self.assertRaises(ValueError): - layer = categorical_crossing.CategoryCrossing(num_bins=2) - inputs_0 = ragged_factory_ops.constant( - 
[['omar'], ['marlo']], - dtype=dtypes.string) - inputs_1 = sparse_tensor.SparseTensor( - indices=[[0, 1], [1, 2]], values=['d', 'e'], dense_shape=[2, 3]) - layer([inputs_0, inputs_1]) - def test_crossing_with_dense_inputs(self): layer = categorical_crossing.CategoryCrossing() inputs_0 = np.asarray([[1, 2]]) @@ -251,13 +191,6 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertAllEqual(expected_output, model.predict([inputs_0, inputs_1, inputs_2])) - def test_crossing_hashed_with_dense_inputs(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2) - inputs_0 = np.asarray([[1, 2]]) - inputs_1 = np.asarray([[1, 3]]) - output = layer([inputs_0, inputs_1]) - self.assertAllClose([[1, 1, 0, 0]], output) - def test_crossing_compute_output_signature(self): input_shapes = [ tensor_shape.TensorShape([2, 2]), @@ -272,18 +205,9 @@ class CategoryCrossingTest(keras_parameterized.TestCase): self.assertEqual(output_spec.shape.dims[0], input_shapes[0].dims[0]) self.assertEqual(output_spec.dtype, dtypes.string) - layer = categorical_crossing.CategoryCrossing(num_bins=2) - output_spec = layer.compute_output_signature(input_specs) - self.assertEqual(output_spec.shape.dims[0], input_shapes[0].dims[0]) - self.assertEqual(output_spec.dtype, dtypes.int64) - - def test_crossing_with_invalid_hash_key(self): - with self.assertRaises(ValueError): - _ = categorical_crossing.CategoryCrossing(hash_key=133) - @tf_test_util.run_v2_only def test_config_with_custom_name(self): - layer = categorical_crossing.CategoryCrossing(num_bins=2, name='hashing') + layer = categorical_crossing.CategoryCrossing(depth=2, name='hashing') config = layer.get_config() layer_1 = categorical_crossing.CategoryCrossing.from_config(config) self.assertEqual(layer_1.name, layer.name) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt new file mode 100644 index 00000000000..0407188ab6b --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt @@ -0,0 +1,222 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryCrossing" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" 
+ } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'depth\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + 
member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "partial_crossing" + argspec: "args=[\'self\', \'partial_inputs\', \'ragged_out\', \'sparse_out\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index f369c32a65e..20e5ca1af9c 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.keras.layers.experimental.preprocessing" tf_module { + member { + name: "CategoryCrossing" + mtype: "" + } member { name: "CenterCrop" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt new file mode 100644 index 00000000000..0407188ab6b --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-category-crossing.pbtxt @@ -0,0 +1,222 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.CategoryCrossing" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'depth\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: 
"args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_spec\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "partial_crossing" + argspec: "args=[\'self\', \'partial_inputs\', \'ragged_out\', \'sparse_out\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git 
a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index f369c32a65e..20e5ca1af9c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -1,5 +1,9 @@ path: "tensorflow.keras.layers.experimental.preprocessing" tf_module { + member { + name: "CategoryCrossing" + mtype: "" + } member { name: "CenterCrop" mtype: "" From 5fee245d9ff76b9fae2b7404f47aae83c84b8564 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Wed, 13 May 2020 13:57:37 -0700 Subject: [PATCH 131/412] [XLA] Basic (R1) support for CPU bounded dynamic shapes. - Add dynamic tensor metadata read/write in XRT. - Implement two custom calls: PadToStatic and SliceToDynamic -- R1 only. - Some helper functions in shape util to do sanity check. - Tests -- R1 Only. PiperOrigin-RevId: 311398639 Change-Id: I7129fd13f4e0a2b7a14efb52eb814f753a15e05e --- tensorflow/compiler/xla/BUILD | 1 + tensorflow/compiler/xla/service/cpu/BUILD | 1 + .../compiler/xla/service/cpu/cpu_compiler.cc | 15 +- .../xla/service/cpu/cpu_executable.cc | 7 +- .../compiler/xla/service/cpu/ir_emitter.cc | 88 ++++++ .../compiler/xla/service/cpu/ir_emitter.h | 2 + .../compiler/xla/service/shaped_buffer.h | 13 + tensorflow/compiler/xla/shape_util.cc | 36 ++- tensorflow/compiler/xla/shape_util.h | 29 +- tensorflow/compiler/xrt/kernels/BUILD | 1 + .../compiler/xrt/kernels/xrt_execute_op.cc | 257 ++++++++++++++- tensorflow/compiler/xrt/tests/raw_api_test.cc | 299 ++++++++++++++++++ 12 files changed, 726 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 0193bea9d6d..45f49cee328 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -331,6 +331,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/container:inlined_vector", "@com_google_absl//absl/strings", diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 121bdedf2dd..2f432cd9356 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -146,6 +146,7 @@ cc_library( "//tensorflow/compiler/xla/service:conditional_simplifier", "//tensorflow/compiler/xla/service:convolution_group_converter", "//tensorflow/compiler/xla/service:dot_decomposer", + "//tensorflow/compiler/xla/service:dynamic_padder", "//tensorflow/compiler/xla/service:dynamic_index_splitter", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:flatten_call_graph", diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index b04237138e8..fe769bbdd2a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -72,6 +72,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/dot_decomposer.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/dynamic_index_splitter.h" +#include "tensorflow/compiler/xla/service/dynamic_padder.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -239,7 +240,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( HloPassPipeline pipeline("HLO passes through layout assignment"); pipeline.AddInvariantChecker(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false); - // Expand random number generation. pipeline.AddPass(); pipeline.AddPass(RandomAlgorithm::RNG_PHILOX); @@ -273,6 +273,13 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( pipeline.AddPass( cost_model, /*convert_batch_groups_only=*/false); + pipeline.AddPass(); + pipeline.AddPass( + /*rewrite_training_op=*/true, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/true); + pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(target_machine_features); { auto& pass = @@ -281,12 +288,6 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn( /*allow_mixed_precision=*/false); pass.AddPass(); - pass.AddPass(); - pass.AddPass( - /*rewrite_training_op=*/true, - /*rewrite_inference_op=*/true, - /*rewrite_grad_op=*/true); - pipeline.AddPass(); AlgebraicSimplifierOptions options; options.set_enable_dot_strength_reduction(false); pass.AddPass(options); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index 8c1ae0179c0..f031daecb1f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -363,7 +363,12 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( if (shape.IsOpaque()) { return sizeof(void*); } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + if (shape.is_static() || shape.IsTuple()) { + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + } + // Each dynamic dimension size is represented as a S32. + int64 metadata_size = sizeof(int32) * shape.dimensions_size(); + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)) + metadata_size; } const InstructionValueSet& CpuExecutable::GetRootValueSet() const { diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 2b715bfa17a..f516a1538d3 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -2357,7 +2357,95 @@ Status IrEmitter::HandleCall(HloInstruction* call) { return Status::OK(); } +Status IrEmitter::HandleSliceToDynamic(HloInstruction* hlo) { + // TODO(jackcao): Generalize this to generic llvm emitter. 
+ TF_RET_CHECK(hlo->shape().rank() == 1); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + for (int64 i = 1; i < hlo->operand_count(); ++i) { + const int64 dim_index = i - 1; + llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(i)); + llvm::LoadInst* dim_size = b_.CreateLoad(source_buffer, "dim_size"); + llvm::Value* dest_buffer = GetEmittedValueFor(hlo); + llvm::Value* raw_buffer = + b_.CreateBitCast(dest_buffer, b_.getInt8Ty()->getPointerTo()); + + int32 raw_data_size = + ShapeUtil::ByteSizeOf(ShapeUtil::MakeStaticShape(hlo->shape())); + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + b_.CreateStore(dim_size, + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo())); + } + + return EmitTargetElementLoop(hlo, + [=](const llvm_ir::IrArray::Index& dest_index) { + // TODO(jackcao): Properly linearize dest_index + // and delinearize to source index. + return GetIrArrayFor(hlo->operand(0)) + .EmitReadArrayElement(dest_index, &b_); + }); +} + +Status IrEmitter::HandlePadToStatic(HloInstruction* hlo) { + // TODO(jackcao): Generalize this to generic llvm emitter. + TF_RET_CHECK(hlo->operand(0)->shape().rank() == 1); + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(hlo)); + + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice data_slice, + assignment_.GetUniqueSlice(hlo, {0})); + const Shape& data_shape = ShapeUtil::GetSubshape(hlo->shape(), {0}); + llvm::Value* data_address = EmitBufferPointer(data_slice, data_shape); + llvm_ir::IrArray data_array(data_address, data_shape); + TF_RETURN_IF_ERROR(llvm_ir::LoopEmitter( + [=](const llvm_ir::IrArray::Index& dest_index) { + // TODO(jackcao): Properly linearize dest_index and + // delinearize to source index. + return GetIrArrayFor(hlo->operand(0)) + .EmitReadArrayElement(dest_index, &b_); + }, + llvm_ir::IrArray(data_address, data_shape), &b_) + .EmitLoop(IrName(hlo))); + std::vector tuple_operand_ptrs; + tuple_operand_ptrs.push_back(data_array.GetBasePointer()); + + // PadToStatic has a dynamic tensor as input and variadic size of outputs: + // (static_tensor, dynamic_dim_0, dynamic_dim_1, ... ) + // Dynamic dimension sizes starts from output index 1. + for (int64 i = 1; i < hlo->shape().tuple_shapes_size(); ++i) { + // Read from the metadata section of the dynamic input (operand 0). + const Shape& dim_shape = ShapeUtil::GetSubshape(hlo->shape(), {i}); + TF_RET_CHECK(Shape::Equal()(dim_shape, ShapeUtil::MakeScalarShape(S32))); + TF_ASSIGN_OR_RETURN(BufferAllocation::Slice dim_size_slice, + assignment_.GetUniqueSlice(hlo, {i})); + llvm::Value* dest_dim_size_address = + EmitBufferPointer(dim_size_slice, data_shape); + const int64 dim_index = i - 1; + llvm::Value* source_buffer = GetEmittedValueFor(hlo->operand(0)); + llvm::Value* raw_buffer = + b_.CreateBitCast(source_buffer, b_.getInt8Ty()->getPointerTo()); + int32 raw_data_size = ShapeUtil::ByteSizeOf( + ShapeUtil::MakeStaticShape(hlo->operand(0)->shape())); + llvm::Value* metadata = b_.CreateConstInBoundsGEP1_32( + b_.getInt8Ty(), raw_buffer, raw_data_size + dim_index * sizeof(int32)); + llvm::Value* dim_size = b_.CreateLoad( + b_.CreateBitCast(metadata, b_.getInt32Ty()->getPointerTo())); + b_.CreateStore(dim_size, b_.CreateBitCast(dest_dim_size_address, + b_.getInt32Ty()->getPointerTo())); + tuple_operand_ptrs.push_back(dest_dim_size_address); + } + + // Emit static tensor and dynamic sizes as one tuple. 
+ llvm_ir::EmitTuple(GetIrArrayFor(hlo), tuple_operand_ptrs, &b_); + return Status::OK(); +} + Status IrEmitter::HandleCustomCall(HloInstruction* custom_call) { + if (custom_call->custom_call_target() == "PadToStatic") { + return HandlePadToStatic(custom_call); + } + if (custom_call->custom_call_target() == "SliceToDynamic") { + return HandleSliceToDynamic(custom_call); + } absl::Span operands(custom_call->operands()); llvm::Type* i8_ptr_type = b_.getInt8PtrTy(); llvm::AllocaInst* operands_alloca = diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 24524c67b11..9b0d11e9f3f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -183,6 +183,8 @@ class IrEmitter : public DfsHloVisitorWithDefault, } private: + Status HandleSliceToDynamic(HloInstruction* hlo); + Status HandlePadToStatic(HloInstruction* hlo); Status HandleAllReduceSingleReplica(HloInstruction* crs); Status HandleAllReduceMultipleReplica(HloInstruction* crs); diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index a1872330648..b7a67b4e66e 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/types/span.h" #include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" @@ -93,6 +94,18 @@ class ShapedBuffer { buffers_.replace_shape_ptr(&on_device_shape_); } + // Reset the shape of this shaped buffer and underlying buffer structure. + // + // Precondition: EqualStructure(this->on_device_shape_, on_device_shape). + void set_shapes(const Shape& on_host_shape, const Shape& on_device_shape) { + CHECK(ShapeUtil::EqualStructure(on_device_shape, on_device_shape_)) + << "Structures are not the same. new: " << on_device_shape + << ", old: " << on_device_shape_; + on_host_shape_ = on_host_shape; + on_device_shape_ = on_device_shape; + buffers_.replace_shape_ptr(&on_device_shape_); + } + // Returns the underlying ShapeTree containing all the device addresses in the // ShapedBuffer. const ShapeTree& buffers() const { return buffers_; } diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index 22ee5a16a30..52cbb8f95ac 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -22,6 +22,7 @@ limitations under the License. 
#include #include +#include "absl/algorithm/container.h" #include "absl/container/inlined_vector.h" #include "absl/strings/ascii.h" #include "absl/strings/numbers.h" @@ -150,6 +151,19 @@ StatusOr MakeShapeWithLayoutInternal( return equal; } +/* static */ bool ShapeUtil::EqualStructure(const Shape& lhs, + const Shape& rhs) { + bool equal = true; + ForEachSubshape(lhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(rhs, index); + }); + ForEachSubshape(rhs, [&](const Shape& /*subshape*/, const ShapeIndex& index) { + equal &= IndexIsValid(lhs, index); + }); + + return equal; +} + /* static */ int64 ShapeUtil::TrueRank(const Shape& shape) { int64 accum = 0; for (int64 dimension : shape.dimensions()) { @@ -261,6 +275,12 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return ValidateShape(*shape); } +/* static */ Shape ShapeUtil::MakeStaticShape(const Shape& original) { + Shape result = original; + result.clear_dynamic_dimensions(); + return result; +} + /* static */ Shape ShapeUtil::MakeTupleShape(absl::Span shapes) { Shape result; result.set_element_type(TUPLE); @@ -626,8 +646,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( if (shape.element_type() == TUPLE) { return ByteSizeOfTupleIndexTable(shape, pointer_size); } else if (shape.IsArray()) { - int64 byte_size = ByteSizeOfElements(shape); - return byte_size; + return ByteSizeOfElements(shape); } else if (shape.element_type() == TOKEN) { return 0; } else if (shape.element_type() == OPAQUE_TYPE) { @@ -1441,6 +1460,19 @@ ShapeUtil::ReshapeLeavesDimensionsUnmodified( return shape; } +/* static */ bool ShapeUtil::DynamicShapeIsCompatible( + const xla::Shape& dynamic_shape, const xla::Shape& bounded_shape) { + if (dynamic_shape.rank() != bounded_shape.rank()) { + return false; + } + for (int64 i = 0; i < dynamic_shape.rank(); ++i) { + if (dynamic_shape.dimensions(i) > bounded_shape.dimensions(i)) { + return false; + } + } + return true; +} + /* static */ Shape ShapeUtil::FilterDimensions( const std::function& p, Shape shape) { CHECK(shape.IsArray()); diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 7e05e17865d..dde56587482 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -298,6 +298,16 @@ class ShapeUtil { // As Equal, but allow one of lhs and rhs to be F16 while the other is F32. static bool EqualIgnoringFpPrecision(const Shape& lhs, const Shape& rhs); + // Two shapes have same structure if all subshape indices of lhs are presented + // on rhs and vice versa. + // A nested tuple shape of (F32, (S32[2], F32[2, 2])) is structurally equal to + // (S32, (F32[3], S32[2])) as their structures are both (,(,)) + // + // In contrast, (F32, (F32, F32)) is structurally different from + // ((F32, F32), F32) as the former has structure (,(,)) while the latter has + // ((,),) + static bool EqualStructure(const Shape& lhs, const Shape& rhs); + // Returns the number of dimensions for which the dimension is not (trivially) // 1. e.g., f32[2x1x1] has a true rank of 1D, the other dimensions are just // fluff. Note that zero dimensions are included in the true rank, e.g., @@ -339,6 +349,9 @@ class ShapeUtil { // element type changed to type. static Shape ChangeElementType(const Shape& original, PrimitiveType type); + // Retursn a shape with same dimensions but with all dimensions set to static. 
+ static Shape MakeStaticShape(const Shape& original); + // Creates a tuple shape from a slice of element shapes within the tuple. static Shape MakeTupleShape(absl::Span shapes); @@ -643,12 +656,16 @@ class ShapeUtil { static Shape FilterDimensions(const std::function& p, Shape shape); - // Iterates through all the shape indexes, in minor to major order, starting - // from the base indexes, incrementing by the incr steps, up to count - // (index[i] < base[i] + count[i]), and calls the visitor_function with the - // current index. - // The visitor_function visitor function should return true if it wants to - // continue, or false otherwise. + // Returns true if `dynamic_shape` has dimensions that are less-equal to the + // "bounded_shape". + static bool DynamicShapeIsCompatible(const xla::Shape& dynamic_shape, + const xla::Shape& bounded_shape); + + // Iterates through all the shape indexes, in minor to major order, + // starting from the base indexes, incrementing by the incr steps, up to + // count (index[i] < base[i] + count[i]), and calls the visitor_function + // with the current index. The visitor_function visitor function should + // return true if it wants to continue, or false otherwise. // // visitor_function must be a callable of type // StatusOr(absl::Span) or compatible. diff --git a/tensorflow/compiler/xrt/kernels/BUILD b/tensorflow/compiler/xrt/kernels/BUILD index d71e6e2cc73..494ba29e981 100644 --- a/tensorflow/compiler/xrt/kernels/BUILD +++ b/tensorflow/compiler/xrt/kernels/BUILD @@ -49,6 +49,7 @@ cc_library( deps = [ ":xrt_state_ops", "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc index d39b37387f2..2fc599e42df 100644 --- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc +++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h" @@ -38,7 +39,11 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/monitoring/timed.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/platform.h" #include "tensorflow/stream_executor/stream_executor.h" #include "tensorflow/stream_executor/stream_executor_internal.h" @@ -146,6 +151,231 @@ xla::StatusOr GetChainedOpInputs( return std::move(input_buffers); } +// Given a shape, returns a byte array representing the shape metadata of the +// shape. The shape metadata contains dimensions sizes stored as contiguous S32. +std::vector PrepareMetadata(const xla::Shape& shape) { + DCHECK(shape.is_static()); + DCHECK(shape.IsArray()); + // Each dimension size is stored as a S32. 
+  std::vector<int32> result(shape.dimensions_size());
+  for (int64 i = 0; i < shape.dimensions_size(); ++i) {
+    result[i] = shape.dimensions(i);
+  }
+  return result;
+}
+
+// Given a buffer with dynamic shape, update buffer metadata at the correct
+// offset starting from that buffer.
+//
+// +-----------+
+// |Payload |
+// +-----------+
+// | Padding |
+// +-----------+
+// |dim_size_0 | (each dim_size is an S32):
+// +-----------+
+// |dim_size_1 |
+// +-----------+
+// ..........
+// +-----------+
+//
+// Size of payload = ByteSizeOf(runtime_shape)
+// Size of payload + padding = ByteSizeOf(compile_time_shape_static)
+// Size of payload + padding + metadata = ByteSizeOf(compile_time_shape)
+Status UpdateMetadata(se::Stream* stream, se::DeviceMemory<uint8>* buffer,
+                      const xla::Shape& compile_time_shape,
+                      const xla::Shape& runtime_shape) {
+  TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform(
+                                         stream->parent()->platform()));
+  TF_ASSIGN_OR_RETURN(
+      auto transfer_manager,
+      xla::TransferManager::GetForPlatform(stream->parent()->platform()));
+  auto shape_size_fn = compiler->ShapeSizeBytesFunction();
+  xla::Shape compile_time_shape_static =
+      xla::ShapeUtil::MakeStaticShape(compile_time_shape);
+  uint64 offset = shape_size_fn(compile_time_shape_static);
+  uint64 metadata_size = shape_size_fn(compile_time_shape) - offset;
+  auto metadata_buffer =
+      stream->parent()->GetSubBuffer(buffer, offset, metadata_size);
+
+  auto metadata_literal = std::make_shared<xla::Literal>(
+      xla::LiteralUtil::CreateR1<int32>(PrepareMetadata(runtime_shape)));
+  TF_RETURN_IF_ERROR(transfer_manager->TransferArrayToDeviceAsync(
+      stream, *metadata_literal, metadata_buffer));
+  // Retain the literal until the end of the transfer.
+  stream->ThenDoHostCallback([metadata_literal]() { return Status::OK(); });
+  return Status::OK();
+}
+
+// Given a static input buffer, convert it to dynamic form by expanding it to
+// the bounded size and attaching metadata filled with dimension sizes.
+//
+// From:
+// +--------+
+// |Payload |
+// +--------+
+//
+// To:
+//
+// +--------+
+// |Payload |
+// +--------+
+// | Padding|
+// +--------+
+// |Metadata|
+// +--------+
+//
+// As we can't expand the size of an existing memory allocation, a reallocation
+// is required. A list of new allocations is returned by this function. The
+// caller is responsible for maintaining those allocations.
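A rough worked example of the layout described in the comments above (editorial illustration, not part of the patch): for an input whose compile-time shape is the bounded dynamic f32[<=4] and whose runtime shape is f32[2], the payload occupies 8 bytes, the padded static bound occupies 16 bytes, and a single S32 of metadata follows, so the reallocated buffer is 20 bytes and the metadata starts at offset 16. A minimal standalone sketch of that arithmetic, assuming 4-byte floats and one int32 of metadata per dimension:

```cpp
// Editorial illustration: byte layout for a dynamic input f32[<=4] whose
// runtime shape is f32[2]. The shapes and element sizes are assumptions made
// for this example, not values taken from the patch.
#include <cstdint>
#include <iostream>

int main() {
  const int64_t bound_elems = 4;    // compile-time bound of the dynamic dimension
  const int64_t runtime_elems = 2;  // actual extent at runtime
  const int64_t rank = 1;           // one S32 of metadata per dimension

  const int64_t payload_bytes = runtime_elems * sizeof(float);  // 8
  const int64_t padded_bytes = bound_elems * sizeof(float);     // 16 = metadata offset
  const int64_t metadata_bytes = rank * sizeof(int32_t);        // 4

  std::cout << "metadata offset: " << padded_bytes
            << " bytes, total allocation: " << padded_bytes + metadata_bytes
            << " bytes\n";
  return 0;
}
```

These are the same quantities that `shape_size_fn(compile_time_shape_static)` and `shape_size_fn(compile_time_shape) - offset` produce in `UpdateMetadata` above.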
+xla::StatusOr> UpdateDynamicInputs( + se::Stream* stream, se::DeviceMemoryAllocator* allocator, + std::vector runtime_inputs, + const std::vector& compile_time_shapes) { + std::vector new_allocations; + TF_RET_CHECK(runtime_inputs.size() == compile_time_shapes.size()); + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + for (int64 i = 0; i < compile_time_shapes.size(); i++) { + const xla::Shape& compile_time_shape = compile_time_shapes[i].shape(); + if (compile_time_shape.is_static()) { + continue; + } + auto* runtime_input = runtime_inputs[i]; + + bool element_modified = false; + TF_RETURN_IF_ERROR(xla::ShapeUtil::ForEachSubshapeWithStatus( + compile_time_shape, + [&](const xla::Shape& compile_time_shape, + const xla::ShapeIndex& index) -> Status { + if (compile_time_shape.IsTuple() || compile_time_shape.is_static()) { + return Status::OK(); + } + const xla::Shape& runtime_shape = xla::ShapeUtil::GetSubshape( + runtime_input->on_device_shape(), index); + TF_RET_CHECK(!runtime_shape.IsTuple()); + TF_RET_CHECK(xla::ShapeUtil::DynamicShapeIsCompatible( + runtime_shape, compile_time_shape)); + se::DeviceMemoryBase* static_input = + runtime_input->buffers().mutable_element(index); + TF_ASSIGN_OR_RETURN( + auto dynamic_input, + allocator->Allocate(stream->parent()->device_ordinal(), + shape_size_fn(compile_time_shape))); + new_allocations.emplace_back(std::move(dynamic_input)); + se::DeviceMemory* dynamic_input_base = + new_allocations.back().ptr(); + // Send the original data to the new location. + stream->ThenMemcpyD2D(dynamic_input_base, *static_input, + static_input->size()); + TF_RETURN_IF_ERROR(UpdateMetadata(stream, dynamic_input_base, + compile_time_shape, runtime_shape)); + // Modify the memory location in the input shape tree to point to the + // new input. + runtime_input->set_buffer(*dynamic_input_base, index); + element_modified = true; + return Status::OK(); + })); + if (element_modified) { + runtime_input->set_shapes(compile_time_shape, compile_time_shape); + // The input location has been modified, need to fix tuple table to + // point to the correct address. + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR( + transfer_manager->WriteTupleIndexTablesAsync(stream, *runtime_input)); + } + } + return std::move(new_allocations); +} + +xla::StatusOr ReadMetadataLiteral( + se::Stream* stream, se::DeviceMemoryBase* buffer, + const xla::Shape& buffer_shape, xla::TransferManager* transfer_manager) { + TF_ASSIGN_OR_RETURN(auto compiler, xla::Compiler::GetForPlatform( + stream->parent()->platform())); + auto shape_size_fn = compiler->ShapeSizeBytesFunction(); + xla::Shape buffer_shape_static = + xla::ShapeUtil::MakeStaticShape(buffer_shape); + const int64 offset = shape_size_fn(buffer_shape_static); + int64 metadata_size = shape_size_fn(buffer_shape) - offset; + TF_RET_CHECK(metadata_size != 0); + auto buffer_8 = se::DeviceMemory(*buffer); + auto metadata_buffer = + stream->parent()->GetSubBuffer(&buffer_8, offset, metadata_size); + return transfer_manager->TransferArrayFromDevice( + stream, + xla::ShapeUtil::MakeShape(xla::S32, {buffer_shape.dimensions_size()}), + metadata_buffer); +} + +// For each subshape in the result buffer that's dynamic, read the dynamic +// dimension sizes from the metadata, and update output shapes. The result shape +// is a static and concrete shape. 
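For intuition about this step (an editorial sketch, not code from the patch): if the metadata read back from the device for a result with bounded device shape f32[<=4] is {2}, the dimensions are overwritten with those sizes and the dynamic bits are cleared, leaving the concrete static shape f32[2]. The helper name below is invented for the illustration; the calls mirror the ones used in `UpdateDynamicOutputs` that follows and assume the XLA headers from this repository:

```cpp
#include <cstdint>
#include <vector>

#include "tensorflow/compiler/xla/shape_util.h"

// Hypothetical helper (not part of the patch): turn a bounded dynamic shape
// into the concrete static shape described by device metadata, e.g. {2}.
xla::Shape MakeConcreteShape(const std::vector<int32_t>& metadata) {
  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {4});  // bound of 4
  shape.set_dynamic_dimension(0, true);                         // now f32[<=4]
  for (int64_t i = 0; i < shape.rank(); ++i) {
    shape.mutable_dimensions()[i] = metadata[i];                // dims -> {2}
  }
  shape.clear_dynamic_dimensions();                             // static f32[2]
  return shape;
}
```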
+xla::Status UpdateDynamicOutputs(se::Stream* stream, + xla::ShapedBuffer* shaped_buffer, + xla::Shape* output_host_shape, + xla::Shape* output_device_shape) { + DCHECK(output_device_shape->is_dynamic()); + TF_ASSIGN_OR_RETURN( + auto transfer_manager, + xla::TransferManager::GetForPlatform(stream->parent()->platform())); + TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); + TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + const xla::Shape& buffer_shape = + xla::ShapeUtil::GetSubshape(*output_device_shape, index); + if (buffer_shape.IsTuple()) { + return Status::OK(); + } + xla::Shape& host_shape = + *xla::ShapeUtil::GetMutableSubshape(output_host_shape, index); + xla::Shape& device_shape = + *xla::ShapeUtil::GetMutableSubshape(output_device_shape, index); + if (device_shape.is_static()) { + return Status::OK(); + } + TF_ASSIGN_OR_RETURN(auto metadata, + ReadMetadataLiteral(stream, buffer, buffer_shape, + transfer_manager)); + // Update shape size from metadata. + for (int64 i = 0; i < metadata.element_count(); ++i) { + host_shape.mutable_dimensions()[i] = metadata.Get({i}); + device_shape.mutable_dimensions()[i] = metadata.Get({i}); + } + return Status::OK(); + })); + output_host_shape->clear_dynamic_dimensions(); + output_device_shape->clear_dynamic_dimensions(); + return Status::OK(); +} + +// Create output tuple from run_result. +xla::StatusOr> CreateOutputTuple( + se::Stream* stream, xla::ScopedShapedBuffer run_result, + xla::Backend* backend, int device_ordinal) { + XRTTupleAllocation* output_tuple; + xla::ShapedBuffer shaped_buffer = run_result.release(); + if (shaped_buffer.on_device_shape().is_dynamic()) { + // Update dynamic shapes from output buffer, and create a XRT tensor with + // dimension sizes read from metadata. + xla::Shape output_host_shape = shaped_buffer.on_host_shape(); + xla::Shape output_device_shape = shaped_buffer.on_device_shape(); + TF_RETURN_IF_ERROR(UpdateDynamicOutputs( + stream, &shaped_buffer, &output_host_shape, &output_device_shape)); + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, output_host_shape, output_device_shape, backend, + device_ordinal, &output_tuple)); + } else { + // Fast-path: Don't copy shapes of output buffer. + TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( + shaped_buffer, backend, device_ordinal, &output_tuple)); + } + return RefPtr(output_tuple); +} + xla::StatusOr> RunExecutable( OpKernelContext* context, XRTGenericDeviceAccessor::ScopedRef* device_ref, xla::LocalExecutable* executable, const InputBuffers& input_buffers, @@ -191,18 +421,31 @@ xla::StatusOr> RunExecutable( Env* env = Env::Default(); auto start_time = env->NowMicros(); + const std::vector& shape_layouts = + executable->executable() + ->module_config() + .entry_computation_layout() + .parameter_layouts(); + TF_ASSIGN_OR_RETURN(auto new_allocations, + UpdateDynamicInputs(stream, run_options.allocator(), + input_buffers.input_pointers, + shape_layouts)); + auto new_allocations_ptr = + std::make_shared>( + std::move(new_allocations)); TF_ASSIGN_OR_RETURN( xla::ScopedShapedBuffer run_result, executable->Run(input_buffers.input_pointers, run_options)); + // Retain the new allocation for input memory until the end of execution. 
+ stream->ThenDoHostCallback([new_allocations_ptr]() { return Status::OK(); }); + auto elapsed = env->NowMicros() - start_time; VLOG(2) << "Elapsed time: " << elapsed << "us"; - auto shaped_buffer = run_result.release(); - XRTTupleAllocation* output_tuple; - TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer( - shaped_buffer, device_ref->backend(), device_ref->device_ordinal(), - &output_tuple)); - RefPtr output_tuple_ptr(output_tuple); + TF_ASSIGN_OR_RETURN( + RefPtr output_tuple_ptr, + CreateOutputTuple(stream, std::move(run_result), device_ref->backend(), + device_ref->device_ordinal())); // The ScopedShapedBuffer returned by the executable Run() API, in case of // input/output buffer aliasing, might have holes in it, which need to be @@ -215,7 +458,7 @@ xla::StatusOr> RunExecutable( const xla::HloInputOutputAliasConfig::Alias& alias) -> Status { TF_RET_CHECK(alias.parameter_number < input_buffers.input_tuples.size()); return alias.kind == xla::HloInputOutputAliasConfig::AliasKind::kUserAlias - ? output_tuple->AliasBufferFrom( + ? output_tuple_ptr->AliasBufferFrom( *input_buffers.input_tuples[alias.parameter_number], alias.parameter_index, output_index) : Status::OK(); diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc index 243289c8821..fbf9dfd0a17 100644 --- a/tensorflow/compiler/xrt/tests/raw_api_test.cc +++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc @@ -49,6 +49,67 @@ limitations under the License. namespace tensorflow { namespace { +xla::XlaComputation ReturnDynamicR1() { + xla::XlaBuilder builder("ReturnDynamicR1"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + return builder.Build(pad_sum).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + auto p0 = xla::Parameter(&builder, 0, dyn_shape, "P0"); + auto p1 = xla::Parameter(&builder, 1, dyn_shape, "P1"); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +xla::XlaComputation ReturnDynamicR1Tuple() { + xla::XlaBuilder builder("ReturnDynamicR1Tuple"); + auto p0 = xla::Parameter(&builder, 0, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P0"); + auto p1 = xla::Parameter(&builder, 1, + xla::ShapeUtil::MakeShape(xla::F32, {4}), "P1"); + auto p2 = xla::Parameter(&builder, 2, xla::ShapeUtil::MakeShape(xla::S32, {}), + "P2"); + auto sum = xla::Add(p0, p1); + auto sub = xla::Sub(p0, p1); + auto one = xla::One(&builder, xla::S32); + auto pad_sum = xla::SetDimensionSize(sum, p2, 0); + auto pad_sub = xla::SetDimensionSize(sub, p2 + one, 0); + auto tuple = xla::Tuple(&builder, {pad_sum, sum, pad_sub}); + return builder.Build(tuple, /*remove_dynamic_dimensions=*/true).ValueOrDie(); +} + +xla::XlaComputation AcceptDynamicR1Tuple() { + xla::XlaBuilder builder("AcceptDynamicR1"); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + xla::Shape tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + xla::Shape nest_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_shape, dyn_shape}); + auto p = 
xla::Parameter(&builder, 0, tuple_shape, "P0"); + auto p0 = xla::GetTupleElement(p, 0); + auto p1 = xla::GetTupleElement(p, 1); + auto sum = xla::Add(p0, p1); + return builder.Build(sum).ValueOrDie(); +} + +template +xla::LiteralProto CreateR0(T v) { + auto array = xla::LiteralUtil::CreateR0(v); + return array.ToProto(); +} + class XrtClientSession : public ClientSession { public: explicit XrtClientSession(const Scope& scope) : ClientSession(scope) { @@ -61,6 +122,11 @@ class XrtClientSession : public ClientSession { string* xla_test_device_ptr; // initial value set in main() string* xla_platform_ptr; // initial value set in main() +bool SupportDynamicShapes() { + // TODO(jackcao): Support dynamic shapes on XLA GPU. + return *xla_test_device_ptr != "XLA_GPU"; +} + string DeviceFromFlag() { string xla_test_device = *xla_test_device_ptr; return absl::StrCat("/device:", xla_test_device, ":0"); @@ -1035,6 +1101,239 @@ TEST(RawApiTest, CompileAndExecute) { EXPECT_EQ(program_shape.parameters_size(), 2); } +TEST(RawApiTest, DynamicR1Test) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, 2.5f, 1.17f}); + xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(ReturnDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, DynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f, -1.0f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f, 1.0f}); + 
xrt::XLAAllocation p2; + *p2.mutable_value() = CreateR0(2); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = + xla::ShapeUtil::MakeShape(xla::F32, {4}).ToProto(); + *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S32, {}).ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = + xla::ShapeUtil::MakeTupleShape( + {dyn_shape, xla::ShapeUtil::MakeShape(xla::F32, {4}), dyn_shape}) + .ToProto(); + StoreComputationSnapshot(ReturnDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + auto p2_value = ops::Const(cpu_root, p2.SerializeAsString()); + auto p2_handle = ops::XRTAllocate(root, p2_value); + auto result = ops::XRTExecute( + root, c_handle.handle, e_config, + {Output(p0_handle), Output(p1_handle), Output(p2_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected0 = xla::LiteralUtil::CreateR1({2.0f, 1.0f}); + auto expected1 = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f, 0.0f}); + auto expected2 = xla::LiteralUtil::CreateR1({0.0f, 3.0f, 1.0f}); + auto expected = + xla::LiteralUtil::MakeTuple({&expected0, &expected1, &expected2}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1TupleTest) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLATupleNode tuple_desc; + auto subdesc_10 = tuple_desc.add_tuples(); + auto subdesc_11 = tuple_desc.add_tuples(); + subdesc_10->set_input_index(0); + subdesc_10->set_release_input_handle(true); + subdesc_11->set_input_index(1); + subdesc_11->set_release_input_handle(true); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + xla::Shape dyn_tuple_shape = + xla::ShapeUtil::MakeTupleShape({dyn_input_shape, dyn_input_shape}); + *shapes->add_parameters() = dyn_tuple_shape.ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1Tuple(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + 
e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto p0_handle = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto p1_handle = ops::XRTAllocate(root, p1_value); + + auto tuple_0 = ops::Const(root.WithDevice("/device:CPU:0"), + tuple_desc.SerializeAsString()); + auto t0_handle = ops::XRTMakeTuple( + root, tuple_0, + {static_cast(p0_handle), static_cast(p1_handle)}); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {static_cast(t0_handle)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + +TEST(RawApiTest, AcceptDynamicR1Test) { + if (!SupportDynamicShapes()) { + return; + } + xrt::XLAAllocation p0; + *p0.mutable_value() = FloatVector({1.0f, 2.0f, 0.5f}); + xrt::XLAAllocation p1; + *p1.mutable_value() = FloatVector({1.0f, -1.0f, -0.5f}); + + xrt::XLAComputation c; + auto config = c.mutable_config(); + auto shapes = config->mutable_program_shape(); + xla::Shape dyn_input_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_input_shape.set_dynamic_dimension(0, true); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + *shapes->add_parameters() = dyn_input_shape.ToProto(); + xla::Shape dyn_shape = xla::ShapeUtil::MakeShape(xla::F32, {4}); + dyn_shape.set_dynamic_dimension(0, true); + *shapes->mutable_result() = dyn_shape.ToProto(); + StoreComputationSnapshot(AcceptDynamicR1(), c.mutable_hlo_snapshot()); + + xrt::XRTExecutionConfig e; + e.set_release_input_handles(true); + e.set_release_compilation_handle(true); + + Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag()); + Scope cpu_root = root.WithDevice("/device:CPU:0"); + auto e_config = ops::Const(cpu_root, e.SerializeAsString()); + auto computation = ops::Const(cpu_root, c.SerializeAsString()); + auto c_handle = ops::XRTCompile(root, computation); + auto p0_value = ops::Const(cpu_root, p0.SerializeAsString()); + auto allocate_op_0 = ops::XRTAllocate(root, p0_value); + auto p1_value = ops::Const(cpu_root, p1.SerializeAsString()); + auto allocate_op_1 = ops::XRTAllocate(root, p1_value); + auto result = ops::XRTExecute(root, c_handle.handle, e_config, + {Output(allocate_op_0), Output(allocate_op_1)}); + auto read_back = ops::XRTReadLiteralAndRelease(root, result); + TF_ASSERT_OK(root.status()); + + XrtClientSession session(root); + std::vector outputs; + TF_EXPECT_OK(session.Run({read_back, c_handle.program_shape}, &outputs)); + + xla::LiteralProto response; + EXPECT_TRUE(response.ParseFromString(outputs[0].scalar()())); + + auto expected = xla::LiteralUtil::CreateR1({2.0f, 1.0f, 0.0f}); + EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response)); +} + TEST(RawApiTest, CompileAndExecuteWithArgumentVector) { xrt::XLAAllocation p0; *p0.mutable_value() 
= FloatVector({1.0f, 2.0f}); From 062cf92d066771ab3cf2910f125b0209c305eb2b Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Wed, 13 May 2020 14:17:05 -0700 Subject: [PATCH 132/412] [tf.lite] Adds a setQuantizedModelsAllowed() Java API for running quant models with GPU delegate PiperOrigin-RevId: 311402449 Change-Id: I49809a004ad11c4bc9d9e5272472f3b85ea7948f --- .../org/tensorflow/lite/gpu/GpuDelegate.java | 22 ++++- .../java/src/main/native/gpu_delegate_jni.cc | 6 +- tensorflow/lite/java/BUILD | 1 + .../java/org/tensorflow/lite/Interpreter.java | 5 ++ .../lite/NativeInterpreterWrapper.java | 7 ++ .../native/nativeinterpreterwrapper_jni.cc | 9 ++ .../lite/InterpreterTestHelper.java | 29 +++++++ .../tensorflow/lite/gpu/GpuDelegateTest.java | 85 ++++++++++++++++++- 8 files changed, 160 insertions(+), 4 deletions(-) create mode 100644 tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java index 8d802ae044a..895f12f0233 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java +++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java @@ -62,6 +62,18 @@ public class GpuDelegate implements Delegate, Closeable { return this; } + /** + * Enables running quantized models with the delegate. Defaults to false. + * + *

WARNING: This is an experimental API and subject to change. + * + * @param quantizedModelsAllowed When {@code true}, the GPU may run quantized models. + */ + public Options setQuantizedModelsAllowed(boolean quantizedModelsAllowed) { + this.quantizedModelsAllowed = quantizedModelsAllowed; + return this; + } + /** * Sets the inference preference for precision/compilation/runtime tradeoffs. * @@ -74,11 +86,16 @@ public class GpuDelegate implements Delegate, Closeable { } boolean precisionLossAllowed = true; + boolean quantizedModelsAllowed = false; int inferencePreference = INFERENCE_PREFERENCE_FAST_SINGLE_ANSWER; } public GpuDelegate(Options options) { - delegateHandle = createDelegate(options.precisionLossAllowed, options.inferencePreference); + delegateHandle = + createDelegate( + options.precisionLossAllowed, + options.quantizedModelsAllowed, + options.inferencePreference); } public GpuDelegate() { @@ -107,7 +124,8 @@ public class GpuDelegate implements Delegate, Closeable { System.loadLibrary(TFLITE_GPU_LIB); } - private static native long createDelegate(boolean precisionLossAllowed, int preference); + private static native long createDelegate( + boolean precisionLossAllowed, boolean quantizedModelsAllowed, int preference); private static native void deleteDelegate(long delegateHandle); } diff --git a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc index 089e2c2f816..900cc0e0d75 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc +++ b/tensorflow/lite/delegates/gpu/java/src/main/native/gpu_delegate_jni.cc @@ -23,7 +23,7 @@ extern "C" { JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate( JNIEnv* env, jclass clazz, jboolean precision_loss_allowed, - jint inference_preference) { + jboolean quantized_models_allowed, jint inference_preference) { TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default(); if (precision_loss_allowed == JNI_TRUE) { options.inference_priority1 = TFLITE_GPU_INFERENCE_PRIORITY_MIN_LATENCY; @@ -31,6 +31,10 @@ JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_gpu_GpuDelegate_createDelegate( TFLITE_GPU_INFERENCE_PRIORITY_MIN_MEMORY_USAGE; options.inference_priority3 = TFLITE_GPU_INFERENCE_PRIORITY_MAX_PRECISION; } + options.experimental_flags = TFLITE_GPU_EXPERIMENTAL_FLAGS_NONE; + if (quantized_models_allowed) { + options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; + } options.inference_preference = static_cast(inference_preference); return reinterpret_cast(TfLiteGpuDelegateV2Create(&options)); } diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 46cd1be25cb..5eb5e8ab023 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -353,6 +353,7 @@ filegroup( filegroup( name = "portable_gpu_tests", srcs = [ + "src/test/java/org/tensorflow/lite/InterpreterTestHelper.java", "src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java", ], visibility = ["//visibility:public"], diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java index d191b550d8f..5625ef98bb6 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java @@ -491,6 +491,11 @@ public final class Interpreter implements AutoCloseable { wrapper.resetVariableTensors(); } + int 
getExecutionPlanLength() { + checkNotClosed(); + return wrapper.getExecutionPlanLength(); + } + /** Release resources associated with the {@code Interpreter}. */ @Override public void close() { diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java index a22d7241587..8eb3c66f3b5 100644 --- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java +++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/NativeInterpreterWrapper.java @@ -324,6 +324,11 @@ final class NativeInterpreterWrapper implements AutoCloseable { return outputTensor; } + /** Gets the number of ops in the execution plan. */ + int getExecutionPlanLength() { + return getExecutionPlanLength(interpreterHandle); + } + private void applyDelegates(Interpreter.Options options) { // First apply the flex delegate if necessary. This ensures the graph is fully resolved before // applying other delegates. @@ -419,6 +424,8 @@ final class NativeInterpreterWrapper implements AutoCloseable { private static native int getOutputCount(long interpreterHandle); + private static native int getExecutionPlanLength(long interpreterHandle); + private static native String[] getInputNames(long interpreterHandle); private static native String[] getOutputNames(long interpreterHandle); diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc index 971aa5efd7a..690b58ac1f4 100644 --- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc +++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc @@ -241,6 +241,15 @@ Java_org_tensorflow_lite_NativeInterpreterWrapper_getOutputTensorIndex( return interpreter->outputs()[output_index]; } +JNIEXPORT jint JNICALL +Java_org_tensorflow_lite_NativeInterpreterWrapper_getExecutionPlanLength( + JNIEnv* env, jclass clazz, jlong handle) { + tflite_api_dispatcher::Interpreter* interpreter = + convertLongToInterpreter(env, handle); + if (interpreter == nullptr) return 0; + return static_cast(interpreter->execution_plan().size()); +} + JNIEXPORT jint JNICALL Java_org_tensorflow_lite_NativeInterpreterWrapper_getInputCount(JNIEnv* env, jclass clazz, diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java new file mode 100644 index 00000000000..34eb47e4dbe --- /dev/null +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/InterpreterTestHelper.java @@ -0,0 +1,29 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +package org.tensorflow.lite; + +/** Utility for interacting with Interpreter in delegate tests. 
*/ +public abstract class InterpreterTestHelper { + + /** + * Returns the number of nodes in the execution plan that are invoked per inference. + * + * <p>
WARNING: This is an experimental API and subject to change. + */ + public static int executionPlanLength(Interpreter interpreter) { + return interpreter.getExecutionPlanLength(); + } +} diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java index 1fe4a531624..d92a7119aab 100644 --- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java +++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/gpu/GpuDelegateTest.java @@ -18,12 +18,17 @@ package org.tensorflow.lite.gpu; import static com.google.common.truth.Truth.assertThat; import java.nio.ByteBuffer; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Comparator; import java.util.HashMap; import java.util.Map; +import java.util.PriorityQueue; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.JUnit4; import org.tensorflow.lite.Interpreter; +import org.tensorflow.lite.InterpreterTestHelper; import org.tensorflow.lite.TestUtils; /** Unit tests for {@link org.tensorflow.lite.gpu.GpuDelegate}. */ @@ -32,6 +37,9 @@ public final class GpuDelegateTest { private static final String MODEL_PATH = "tensorflow/lite/testdata/multi_add.bin"; private static final ByteBuffer MODEL_BUFFER = TestUtils.getTestFileAsBuffer(MODEL_PATH); + private static final ByteBuffer MOBILENET_QUANTIZED_MODEL_BUFFER = + TestUtils.getTestFileAsBuffer( + "third_party/tensorflow/lite/java/demo/app/src/main/assets/mobilenet_v1_1.0_224_quant.tflite"); @Test public void testBasic() throws Exception { @@ -41,7 +49,7 @@ public final class GpuDelegateTest { } @Test - public void testInterpreterWithGpu() throws Exception { + public void testInterpreterWithGpu_FloatModel() throws Exception { Interpreter.Options options = new Interpreter.Options(); try (GpuDelegate delegate = new GpuDelegate(); Interpreter interpreter = new Interpreter(MODEL_BUFFER, options.addDelegate(delegate))) { @@ -60,4 +68,79 @@ public final class GpuDelegateTest { assertThat(parsedOutput1).usingTolerance(0.1f).containsExactly(expected1).inOrder(); } } + + @Test + public void testInterpreterWithGpu_QuantModelRunWithDelegate() throws Exception { + ByteBuffer img = + TestUtils.getTestImageAsByteBuffer( + "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg"); + + Interpreter.Options options = new Interpreter.Options(); + try (GpuDelegate delegate = + new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true)); + Interpreter interpreter = + new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) { + byte[][] output = new byte[1][1001]; + interpreter.run(img, output); + // Should be only 1 node (Delegate) in the execution plan. 
+ assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(1); + assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3}); + assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001}); + // 653 == "military uniform" + assertThat(getTopKLabels(output, 3)).contains(653); + } + } + + @Test + public void testInterpreterWithGpu_QuantModelRunOnCPU() throws Exception { + ByteBuffer img = + TestUtils.getTestImageAsByteBuffer( + "tensorflow/lite/java/src/testdata/grace_hopper_224.jpg"); + + Interpreter.Options options = new Interpreter.Options(); + try (GpuDelegate delegate = new GpuDelegate(); + Interpreter interpreter = + new Interpreter(MOBILENET_QUANTIZED_MODEL_BUFFER, options.addDelegate(delegate))) { + byte[][] output = new byte[1][1001]; + interpreter.run(img, output); + // Original execution plan remains since default behavior doesn't allow quantized models. + assertThat(InterpreterTestHelper.executionPlanLength(interpreter)).isEqualTo(31); + assertThat(interpreter.getInputTensor(0).shape()).isEqualTo(new int[] {1, 224, 224, 3}); + assertThat(interpreter.getOutputTensor(0).shape()).isEqualTo(new int[] {1, 1001}); + // 653 == "military uniform" + assertThat(getTopKLabels(output, 3)).contains(653); + } + } + + private static ArrayList getTopKLabels(byte[][] byteLabels, int k) { + float[][] labels = new float[1][1001]; + for (int i = 0; i < byteLabels[0].length; ++i) { + labels[0][i] = (byteLabels[0][i] & 0xff) / 255.0f; + } + return getTopKLabels(labels, k); + } + + private static ArrayList getTopKLabels(float[][] labels, int k) { + PriorityQueue> pq = + new PriorityQueue<>( + k, + new Comparator>() { + @Override + public int compare(Map.Entry o1, Map.Entry o2) { + // Intentionally reversed to put high confidence at the head of the queue. + return o1.getValue().compareTo(o2.getValue()) * -1; + } + }); + + for (int i = 0; i < labels[0].length; ++i) { + pq.add(new AbstractMap.SimpleEntry<>(i, labels[0][i])); + } + + final ArrayList topKLabels = new ArrayList<>(); + int topKLabelsSize = Math.min(pq.size(), k); + for (int i = 0; i < topKLabelsSize; ++i) { + topKLabels.add(pq.poll().getKey()); + } + return topKLabels; + } } From 4f6a3a4db05fb591a22c2107f30cba5c3e251412 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Wed, 13 May 2020 14:23:56 -0700 Subject: [PATCH 133/412] Make regularizers API more consistent. 
PiperOrigin-RevId: 311403808 Change-Id: I2a372937bdc316f742015be6080ad945bf970377 --- .../python/keras/layers/serialization_test.py | 8 +- tensorflow/python/keras/regularizers.py | 115 +++++++++++------- .../keras/tests/add_loss_correctness_test.py | 2 +- .../python/keras/utils/generic_utils_test.py | 6 +- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 +++ .../tensorflow.keras.regularizers.-l2.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.l1.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.l2.pbtxt | 18 +++ .../v1/tensorflow.keras.regularizers.pbtxt | 24 ++-- .../tensorflow.keras.regularizers.-l1.pbtxt | 18 +++ .../tensorflow.keras.regularizers.-l2.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.l1.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.l2.pbtxt | 18 +++ .../v2/tensorflow.keras.regularizers.pbtxt | 24 ++-- 14 files changed, 258 insertions(+), 65 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt diff --git a/tensorflow/python/keras/layers/serialization_test.py b/tensorflow/python/keras/layers/serialization_test.py index b18a0fbd8cc..920881c6a3e 100644 --- a/tensorflow/python/keras/layers/serialization_test.py +++ b/tensorflow/python/keras/layers/serialization_test.py @@ -53,7 +53,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): new_layer = keras.layers.deserialize(config) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -88,7 +88,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) if tf2.enabled(): self.assertEqual(new_layer.kernel_initializer.__class__, keras.initializers.OnesV2) @@ -116,7 +116,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) @parameterized.parameters( [batchnorm_v1.BatchNormalization, batchnorm_v2.BatchNormalization]) @@ -135,7 +135,7 @@ class LayerSerializationTest(parameterized.TestCase, test.TestCase): self.assertEqual(new_layer.beta_initializer.__class__, keras.initializers.Zeros) self.assertEqual(new_layer.gamma_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) @parameterized.parameters([rnn_v1.LSTM, rnn_v2.LSTM]) def test_serialize_deserialize_lstm(self, layer): diff --git a/tensorflow/python/keras/regularizers.py 
b/tensorflow/python/keras/regularizers.py index 973d916f7e0..b8bae4cc155 100644 --- a/tensorflow/python/keras/regularizers.py +++ b/tensorflow/python/keras/regularizers.py @@ -14,13 +14,14 @@ # ============================================================================== """Built-in regularizers. """ +# pylint: disable=invalid-name from __future__ import absolute_import from __future__ import division from __future__ import print_function import six -from tensorflow.python.keras import backend as K +from tensorflow.python.keras import backend from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object from tensorflow.python.ops import math_ops @@ -60,8 +61,8 @@ class Regularizer(object): >>> layer = tf.keras.layers.Dense( ... 5, input_dim=5, ... kernel_initializer='ones', - ... kernel_regularizer=tf.keras.regularizers.l1(0.01), - ... activity_regularizer=tf.keras.regularizers.l2(0.01)) + ... kernel_regularizer=tf.keras.regularizers.L1(0.01), + ... activity_regularizer=tf.keras.regularizers.L2(0.01)) >>> tensor = tf.ones(shape=(5, 5)) * 2.0 >>> out = layer(tensor) @@ -73,9 +74,9 @@ class Regularizer(object): ## Available penalties ```python - tf.keras.regularizers.l1(0.3) # L1 Regularization Penalty - tf.keras.regularizers.l2(0.1) # L2 Regularization Penalty - tf.keras.regularizers.l1_l2(l1=0.01, l2=0.01) # L1 + L2 penalties + tf.keras.regularizers.L1(0.3) # L1 Regularization Penalty + tf.keras.regularizers.L2(0.1) # L2 Regularization Penalty + tf.keras.regularizers.L1L2(l1=0.01, l2=0.01) # L1 + L2 penalties ``` ## Directly calling a regularizer @@ -84,7 +85,7 @@ class Regularizer(object): as if it is a one-argument function. E.g. - >>> regularizer = tf.keras.regularizers.l2(2.) + >>> regularizer = tf.keras.regularizers.L2(2.) >>> tensor = tf.ones(shape=(5, 5)) >>> regularizer(tensor) @@ -194,7 +195,7 @@ class Regularizer(object): @keras_export('keras.regularizers.L1L2') class L1L2(Regularizer): - r"""A regularizer that applies both L1 and L2 regularization penalties. + """A regularizer that applies both L1 and L2 regularization penalties. The L1 regularization penalty is computed as: `loss = l1 * reduce_sum(abs(x))` @@ -202,19 +203,23 @@ class L1L2(Regularizer): The L2 regularization penalty is computed as `loss = l2 * reduce_sum(square(x))` + L1L2 may be passed to a layer as a string identifier: + + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1_l2') + + In this case, the default values used are `l1=0.01` and `l2=0.01`. + Attributes: l1: Float; L1 regularization factor. l2: Float; L2 regularization factor. """ def __init__(self, l1=0., l2=0.): # pylint: disable=redefined-outer-name - self.l1 = K.cast_to_floatx(l1) - self.l2 = K.cast_to_floatx(l2) + self.l1 = backend.cast_to_floatx(l1) + self.l2 = backend.cast_to_floatx(l2) def __call__(self, x): - if not self.l1 and not self.l2: - return K.constant(0.) - regularization = 0. + regularization = backend.constant(0., dtype=x.dtype) if self.l1: regularization += self.l1 * math_ops.reduce_sum(math_ops.abs(x)) if self.l2: @@ -225,39 +230,64 @@ class L1L2(Regularizer): return {'l1': float(self.l1), 'l2': float(self.l2)} -# Aliases. - - -@keras_export('keras.regularizers.l1') -def l1(l=0.01): - r"""Create a regularizer that applies an L1 regularization penalty. +@keras_export('keras.regularizers.L1', 'keras.regularizers.l1') +class L1(Regularizer): + """A regularizer that applies a L1 regularization penalty. 
The L1 regularization penalty is computed as: - `loss = l * reduce_sum(abs(x))` + `loss = l1 * reduce_sum(abs(x))` - Arguments: - l: Float; L1 regularization factor. + L1 may be passed to a layer as a string identifier: - Returns: - An L1 Regularizer with the given regularization factor. + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l1') + + In this case, the default value used is `l1=0.01`. + + Attributes: + l1: Float; L1 regularization factor. """ - return L1L2(l1=l) + + def __init__(self, l1=0.01, **kwargs): # pylint: disable=redefined-outer-name + l1 = kwargs.pop('l', l1) # Backwards compatibility + if kwargs: + raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) + self.l1 = backend.cast_to_floatx(l1) + + def __call__(self, x): + return self.l1 * math_ops.reduce_sum(math_ops.abs(x)) + + def get_config(self): + return {'l1': float(self.l1)} -@keras_export('keras.regularizers.l2') -def l2(l=0.01): - r"""Create a regularizer that applies an L2 regularization penalty. +@keras_export('keras.regularizers.L2', 'keras.regularizers.l2') +class L2(Regularizer): + """A regularizer that applies a L2 regularization penalty. The L2 regularization penalty is computed as: - `loss = l * reduce_sum(square(x))` + `loss = l2 * reduce_sum(square(x))` - Arguments: - l: Float; L2 regularization factor. + L2 may be passed to a layer as a string identifier: - Returns: - An L2 Regularizer with the given regularization factor. + >>> dense = tf.keras.layers.Dense(3, kernel_regularizer='l2') + + In this case, the default value used is `l2=0.01`. + + Attributes: + l2: Float; L2 regularization factor. """ - return L1L2(l2=l) + + def __init__(self, l2=0.01, **kwargs): # pylint: disable=redefined-outer-name + l2 = kwargs.pop('l', l2) # Backwards compatibility + if kwargs: + raise TypeError('Argument(s) not recognized: %s' % (kwargs,)) + self.l2 = backend.cast_to_floatx(l2) + + def __call__(self, x): + return self.l2 * math_ops.reduce_sum(math_ops.square(x)) + + def get_config(self): + return {'l2': float(self.l2)} @keras_export('keras.regularizers.l1_l2') @@ -280,6 +310,11 @@ def l1_l2(l1=0.01, l2=0.01): # pylint: disable=redefined-outer-name return L1L2(l1=l1, l2=l2) +# Deserialization aliases. +l1 = L1 +l2 = L2 + + @keras_export('keras.regularizers.serialize') def serialize(regularizer): return serialize_keras_object(regularizer) @@ -287,6 +322,10 @@ def serialize(regularizer): @keras_export('keras.regularizers.deserialize') def deserialize(config, custom_objects=None): + if config == 'l1_l2': + # Special case necessary since the defaults used for "l1_l2" (string) + # differ from those of the L1L2 class. + return L1L2(l1=0.01, l2=0.01) return deserialize_keras_object( config, module_objects=globals(), @@ -296,18 +335,12 @@ def deserialize(config, custom_objects=None): @keras_export('keras.regularizers.get') def get(identifier): + """Retrieve a regularizer instance from a config or identifier.""" if identifier is None: return None if isinstance(identifier, dict): return deserialize(identifier) elif isinstance(identifier, six.string_types): - identifier = str(identifier) - # We have to special-case functions that return classes. - # TODO(omalleyt): Turn these into classes or class aliases. - special_cases = ['l1', 'l2', 'l1_l2'] - if identifier in special_cases: - # Treat like a class. 
- return deserialize({'class_name': identifier, 'config': {}}) return deserialize(str(identifier)) elif callable(identifier): return identifier diff --git a/tensorflow/python/keras/tests/add_loss_correctness_test.py b/tensorflow/python/keras/tests/add_loss_correctness_test.py index 323a2626c15..a19eec75ffb 100644 --- a/tensorflow/python/keras/tests/add_loss_correctness_test.py +++ b/tensorflow/python/keras/tests/add_loss_correctness_test.py @@ -288,7 +288,7 @@ class TestAddLossCorrectness(keras_parameterized.TestCase): model_layers, input_shape=(10,)) x = np.ones((10, 10), 'float32') - y = np.ones((10, 1), 'float32') + y = np.zeros((10, 1), 'float32') optimizer = RMSPropOptimizer(learning_rate=0.001) model.compile( diff --git a/tensorflow/python/keras/utils/generic_utils_test.py b/tensorflow/python/keras/utils/generic_utils_test.py index 334758871fa..ddaa60c3c24 100644 --- a/tensorflow/python/keras/utils/generic_utils_test.py +++ b/tensorflow/python/keras/utils/generic_utils_test.py @@ -201,7 +201,7 @@ class SerializeKerasObjectTest(test.TestCase): config, custom_objects={'SerializableInt': SerializableInt}) self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) self.assertEqual(new_layer.units.__class__, SerializableInt) self.assertEqual(new_layer.units, 3) @@ -253,7 +253,7 @@ class SerializeKerasObjectTest(test.TestCase): self.assertEqual(new_layer.name, 'SerializableNestedInt') self.assertEqual(new_layer.activation, keras.activations.relu) self.assertEqual(new_layer.bias_regularizer.__class__, - keras.regularizers.L1L2) + keras.regularizers.L2) self.assertEqual(new_layer.units.__class__, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertEqual(new_layer.units.int_obj.__class__, SerializableInt) @@ -293,7 +293,7 @@ class SerializeKerasObjectTest(test.TestCase): 'SerializableNestedInt': SerializableNestedInt }) self.assertEqual(new_layer.activation, keras.activations.relu) - self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L1L2) + self.assertIsInstance(new_layer.bias_regularizer, keras.regularizers.L2) self.assertIsInstance(new_layer.units, SerializableNestedInt) self.assertEqual(new_layer.units, 3) self.assertIs(new_layer.units.fn, serializable_fn) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt new file mode 100644 index 00000000000..5cb133ca85d --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt new file mode 100644 index 00000000000..c5b706d1d2f --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.-l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L2" +tf_class { + is_instance: "" + is_instance: "" 
+ is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt new file mode 100644 index 00000000000..eb769a0dc44 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt new file mode 100644 index 00000000000..fda5c76ecd2 --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt index bb10d41d704..96a4b193b1b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.regularizers.pbtxt @@ -1,13 +1,29 @@ path: "tensorflow.keras.regularizers" tf_module { + member { + name: "L1" + mtype: "" + } member { name: "L1L2" mtype: "" } + member { + name: "L2" + mtype: "" + } member { name: "Regularizer" mtype: "" } + member { + name: "l1" + mtype: "" + } + member { + name: "l2" + mtype: "" + } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -16,18 +32,10 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "l1" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } - member_method { - name: "l2" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt new file mode 100644 index 
00000000000..5cb133ca85d --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt new file mode 100644 index 00000000000..c5b706d1d2f --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.-l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.L2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt new file mode 100644 index 00000000000..eb769a0dc44 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l1.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l1" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l1\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt new file mode 100644 index 00000000000..fda5c76ecd2 --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.l2.pbtxt @@ -0,0 +1,18 @@ +path: "tensorflow.keras.regularizers.l2" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + member_method { + name: "__init__" + argspec: "args=[\'self\', \'l2\'], varargs=None, keywords=kwargs, defaults=[\'0.01\'], " + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt index bb10d41d704..96a4b193b1b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.regularizers.pbtxt @@ -1,13 +1,29 @@ path: "tensorflow.keras.regularizers" tf_module { + member { + name: "L1" + mtype: "" + } member { name: "L1L2" mtype: "" } + member { + name: "L2" + mtype: "" + } member { 
name: "Regularizer" mtype: "" } + member { + name: "l1" + mtype: "" + } + member { + name: "l2" + mtype: "" + } member_method { name: "deserialize" argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], " @@ -16,18 +32,10 @@ tf_module { name: "get" argspec: "args=[\'identifier\'], varargs=None, keywords=None, defaults=None" } - member_method { - name: "l1" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "l1_l2" argspec: "args=[\'l1\', \'l2\'], varargs=None, keywords=None, defaults=[\'0.01\', \'0.01\'], " } - member_method { - name: "l2" - argspec: "args=[\'l\'], varargs=None, keywords=None, defaults=[\'0.01\'], " - } member_method { name: "serialize" argspec: "args=[\'regularizer\'], varargs=None, keywords=None, defaults=None" From 8588e0aab8c1ef6a4214bcc2f7d0bb61578a88b3 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Wed, 13 May 2020 14:27:47 -0700 Subject: [PATCH 134/412] Support running a remote function with packed input handles. - Support copying a packed TensorHandle from a client to a remote worker. PiperOrigin-RevId: 311404609 Change-Id: Iadf2c7793dc3631f7be05de611d059733bbfdd63 --- tensorflow/c/eager/c_api_remote_test.cc | 14 ++- .../common_runtime/eager/tensor_handle.cc | 23 +++- .../core/common_runtime/eager/tensor_handle.h | 5 + .../eager/eager_service_impl.cc | 48 ++++++++ .../eager/eager_service_impl.h | 2 + .../eager/eager_service_impl_test.cc | 102 +++++++++++++++++ .../eager/remote_copy_node.cc | 104 +++++++++++++++++- .../eager/remote_copy_node.h | 3 + tensorflow/core/protobuf/eager_service.proto | 22 ++++ 9 files changed, 316 insertions(+), 7 deletions(-) diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 12c63675c87..9dc18c7a6f1 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -434,7 +434,7 @@ string AddVariablesFunction() { return def.SerializeAsString(); } -TEST(CAPI, TestFunctionWithPackedInput) { +void TestFunctionWithPackedInput(const bool remote) { tensorflow::ServerDef server_def = GetServerDef(3); // This server def has the task index set to 0. 
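Note on the regularizer API goldens above: they record new class-based L1 and L2 regularizers (with lowercase l1/l2 kept as aliases), and the change at the top of this section routes string identifiers through deserialize(). A minimal usage sketch follows; the Dense layer, its width, and the 0.01 coefficient are illustrative values, not taken from the patch.

```python
# Sketch of the class-based regularizer API recorded in the goldens above.
# The layer and coefficient are illustrative, not from the patch.
import tensorflow as tf
from tensorflow import keras

dense = keras.layers.Dense(3, kernel_regularizer=keras.regularizers.L2(l2=0.01))

# String identifiers now go through deserialize(); 'l2' is expected to resolve
# to an L2 instance rather than a generic L1L2, matching the updated tests.
reg = keras.regularizers.get('l2')
print(type(reg).__name__)  # L2

# A serialize/deserialize round trip preserves the class.
config = keras.regularizers.serialize(reg)
print(type(keras.regularizers.deserialize(config)).__name__)  # L2
```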
@@ -502,6 +502,10 @@ TEST(CAPI, TestFunctionWithPackedInput) { ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); TFE_OpAddInput(func, packed_handle, status); ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + if (remote) { + TFE_OpSetDevice(func, task1_name, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + } TFE_TensorHandle* retvals[1] = {nullptr}; int num_retvals = 1; @@ -537,6 +541,14 @@ TEST(CAPI, TestFunctionWithPackedInput) { worker_server2.release(); } +TEST(CAPI, TestLocalFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/false); +} + +TEST(CAPI, TestRemoteFunctionWithPackedInput) { + TestFunctionWithPackedInput(/*remote=*/true); +} + void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index dfe3e4a1426..49fa69e2185 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -318,17 +318,14 @@ TensorHandle::TensorHandle(Device* d, Device* op_device, } Status TensorHandle::CreatePackedHandle(std::vector&& handles, + const tensorflow::DataType dtype, + const tensorflow::TensorShape& shape, EagerContext* ctx, TensorHandle** packed_handle) { if (handles.empty()) { return errors::InvalidArgument("Handles should not be empty."); } - // Get the dtype and shape from the fisrt handle since all handles have the - // same dtype and shape. - tensorflow::DataType dtype = handles.at(0)->dtype; - tensorflow::TensorShape shape; - TF_RETURN_IF_ERROR(handles.at(0)->Shape(&shape)); ResourceHandleInfo resource_handle_info; if (dtype == DT_RESOURCE) { TF_RETURN_IF_ERROR( @@ -360,6 +357,22 @@ Status TensorHandle::CreatePackedHandle(std::vector&& handles, return Status::OK(); } +Status TensorHandle::CreatePackedHandle(std::vector&& handles, + EagerContext* ctx, + TensorHandle** packed_handle) { + if (handles.empty()) { + return errors::InvalidArgument("Handles should not be empty."); + } + + // Get the dtype and shape from the fisrt handle since all handles have the + // same dtype and shape. + tensorflow::DataType dtype = handles.at(0)->dtype; + tensorflow::TensorShape shape; + TF_RETURN_IF_ERROR(handles.at(0)->Shape(&shape)); + return CreatePackedHandle(std::move(handles), dtype, shape, ctx, + packed_handle); +} + TensorHandle::TensorHandle(std::vector&& handles, Device* device, const tensorflow::DataType dtype, const tensorflow::TensorShape& shape, diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 25d7fea3200..6f9ee565c73 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -91,6 +91,11 @@ class TensorHandle : public AbstractTensorHandleInterface, // Create a handle which packs the given handles of the same dtype and shape. // If handles are on different devices, assign the packed handle to a // CompositeDevice. 
+ static Status CreatePackedHandle(std::vector&& handles, + const tensorflow::DataType dtype, + const tensorflow::TensorShape& shape, + EagerContext* ctx, + TensorHandle** packed_handle); static Status CreatePackedHandle(std::vector&& handles, EagerContext* ctx, TensorHandle** packed_handle); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index 95131150d3d..6dc03cbc527 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -524,6 +524,8 @@ Status EagerServiceImpl::Enqueue(const EnqueueRequest* request, s = context->Context()->Executor().AddOrExecute(std::move(node)); } else if (item.has_send_tensor()) { s = SendTensor(item.send_tensor(), context->Context()); + } else if (item.has_send_packed_handle()) { + s = SendPackedHandle(item.send_packed_handle(), context->Context()); } else if (item.has_register_function()) { s = RegisterFunction(item.register_function(), context->Context()); } else if (item.has_cleanup_function()) { @@ -643,6 +645,52 @@ Status EagerServiceImpl::SendTensor(const SendTensorOp& send_tensor, return Status::OK(); } +Status EagerServiceImpl::SendPackedHandle( + const SendPackedHandleOp& send_packed_handle, EagerContext* eager_context) { + if (send_packed_handle.handles().empty()) { + return errors::InvalidArgument("Handles should not be empty."); + } + + std::vector handles; + handles.resize(send_packed_handle.handles_size()); + for (int i = 0; i < send_packed_handle.handles_size(); ++i) { + const auto& item = send_packed_handle.handles(i); + if (item.has_local_handle()) { + Tensor tensor; + if (!ParseTensorProtoToTensor(item.local_handle().tensor(), &tensor)) { + return errors::InvalidArgument( + "Invalid TensorProto: ", + item.local_handle().tensor().DebugString()); + } + Device* op_device = nullptr; + TF_RETURN_IF_ERROR(eager_context->FindDeviceFromName( + item.local_handle().device().c_str(), &op_device)); + handles[i] = TensorHandle::CreateLocalHandle( + std::move(tensor), /*d=*/nullptr, op_device, eager_context); + } else { + TF_RETURN_IF_ERROR( + eager_context->RemoteMgr()->DeserializeRemoteTensorHandle( + item.remote_handle(), &handles[i])); + } + } + + tensorflow::TensorHandle* packed_handle = nullptr; + std::vector handles_to_pack = handles; + // Create a unshaped packed TensorHandle. + TF_RETURN_IF_ERROR(TensorHandle::CreatePackedHandle( + std::move(handles_to_pack), handles.at(0)->dtype, TensorShape(), + eager_context, &packed_handle)); + + for (auto* h : handles) { + // Unref handle since it has a ref in the packed handle now. 
+ h->Unref(); + } + + eager_context->RemoteMgr()->AddOperationOutputs({packed_handle}, + send_packed_handle.op_id()); + return Status::OK(); +} + tensorflow::Status EagerServiceImpl::GetServerContext( uint64 context_id, ServerContext** server_context) { tf_shared_lock l(contexts_mu_); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h index 06d4c36b61c..1e4d36ccf9f 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.h +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.h @@ -212,6 +212,8 @@ class EagerServiceImpl { QueueResponse* queue_response); Status SendTensor(const SendTensorOp& send_tensor, EagerContext* eager_context); + Status SendPackedHandle(const SendPackedHandleOp& send_packed_handle, + EagerContext* eager_context); Status RegisterFunction(const RegisterFunctionOp& register_function, EagerContext* eager_context); Status CleanupFunction(const CleanupFunctionOp& cleanup_function); diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 9930bb86e6b..23bf324b80f 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -881,6 +881,108 @@ TEST_F(EagerServiceImplTest, SendTensorTest) { &close_context_response)); } +// Test serializes and sends a pack TensorHandle. +TEST_F(EagerServiceImplTest, SendPackedHandleTest) { + TestEagerServiceImpl eager_service_impl(&worker_env_); + + const string device0 = "/job:localhost/replica:0/task:0/device:CPU:0"; + const string device1 = "/job:localhost/replica:0/task:1/device:CPU:0"; + const string device2 = "/job:localhost/replica:0/task:2/device:CPU:0"; + + uint64 context_id = random::New64(); + CreateContextRequest request; + auto* server_def = request.mutable_server_def(); + server_def->set_job_name("localhost"); + server_def->set_task_index(0); + request.add_cluster_device_attributes()->set_name(device0); + request.add_cluster_device_attributes()->set_name(device1); + request.add_cluster_device_attributes()->set_name(device2); + request.set_context_id(context_id); + CreateContextResponse response; + + TF_ASSERT_OK(eager_service_impl.CreateContext(&request, &response)); + + EnqueueRequest remote_enqueue_request; + remote_enqueue_request.set_context_id(context_id); + EnqueueResponse remote_enqueue_response; + + // Copy a tensor to device0 + auto* send_tensor = remote_enqueue_request.add_queue()->mutable_send_tensor(); + send_tensor->set_op_id(1); + SetTensorProto(send_tensor->add_tensors()); + + // Copy a packed handle to device0 + auto* send_packed_handle = + remote_enqueue_request.add_queue()->mutable_send_packed_handle(); + send_packed_handle->set_op_id(3); + RemoteTensorHandle* remote_handle = + send_packed_handle->add_handles()->mutable_remote_handle(); + remote_handle->set_op_id(send_tensor->op_id()); + remote_handle->set_output_num(0); + remote_handle->set_op_device(device0); + remote_handle->set_device(device0); + + SendPackedHandleOp::LocalTensorHandle* lcoal_handle = + send_packed_handle->add_handles()->mutable_local_handle(); + SetTensorProto(lcoal_handle->mutable_tensor()); + lcoal_handle->set_device(device1); + + remote_handle = send_packed_handle->add_handles()->mutable_remote_handle(); + remote_handle->set_op_id(2); + remote_handle->set_output_num(5); + remote_handle->set_op_device(device2); + 
remote_handle->set_device(device2); + + TF_ASSERT_OK(eager_service_impl.Enqueue(&remote_enqueue_request, + &remote_enqueue_response)); + + tensorflow::TensorHandle* packed_handle; + TF_ASSERT_OK(eager_service_impl.GetTensorHandle( + context_id, RemoteTensorHandleInternal(3, 0), &packed_handle)); + + EXPECT_EQ(packed_handle->Type(), TensorHandle::PACKED); + EXPECT_EQ(packed_handle->NumPackedHandles(), 3); + + TensorHandle* handle0 = nullptr; + TF_ASSERT_OK(packed_handle->ExtractPackedHandle(0, &handle0)); + EXPECT_EQ(handle0->Type(), TensorHandle::LOCAL); + EXPECT_EQ(handle0->op_device()->name(), device0); + const Tensor* t0 = nullptr; + TF_ASSERT_OK(handle0->Tensor(&t0)); + auto actual = t0->flat(); + EXPECT_EQ(4, actual.size()); + EXPECT_EQ(1.0, actual(0)); + EXPECT_EQ(2.0, actual(1)); + EXPECT_EQ(3.0, actual(2)); + EXPECT_EQ(4.0, actual(3)); + + TensorHandle* handle1 = nullptr; + TF_ASSERT_OK(packed_handle->ExtractPackedHandle(1, &handle1)); + EXPECT_EQ(handle1->Type(), TensorHandle::LOCAL); + EXPECT_EQ(handle1->op_device()->name(), device1); + const Tensor* t1 = nullptr; + TF_ASSERT_OK(handle0->Tensor(&t1)); + EXPECT_EQ(t1, t0); + + TensorHandle* handle2 = nullptr; + TF_ASSERT_OK(packed_handle->ExtractPackedHandle(2, &handle2)); + EXPECT_EQ(handle2->Type(), TensorHandle::REMOTE); + EXPECT_EQ(handle2->op_device()->name(), device2); + int64 op_id; + int32 output_num; + TF_ASSERT_OK(handle2->RemoteAddressUntilReady( + absl::get(handle2->device()), &op_id, &output_num)); + EXPECT_EQ(op_id, 2); + EXPECT_EQ(output_num, 5); + + CloseContextRequest close_context_request; + close_context_request.set_context_id(context_id); + close_context_request.set_context_view_id(0); + CloseContextResponse close_context_response; + TF_ASSERT_OK(eager_service_impl.CloseContext(&close_context_request, + &close_context_response)); +} + // Test requests sent to the eager service on master. TEST_F(EagerServiceImplTest, RequestsToMasterTest) { tensorflow::Rendezvous* rendezvous = diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index b281bcef2b3..5d0793b258c 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -25,6 +25,8 @@ limitations under the License. #include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/protobuf.h" namespace tensorflow { namespace eager { @@ -290,6 +292,102 @@ void RemoteCopyNode::StartRecv(StatusCallback done) { } } +Status SerializePackedHandle(const uint64 op_id, TensorHandle* packed_handle, + const Device* target_device, EagerContext* ctx, + SendPackedHandleOp* op) { + op->set_op_id(op_id); + for (int i = 0; i < packed_handle->NumPackedHandles(); ++i) { + TensorHandle* h = nullptr; + TF_RETURN_IF_ERROR(packed_handle->ExtractPackedHandle(i, &h)); + if (h->Type() == TensorHandle::LOCAL) { + // AsProtoTensorContent doesn't work when the tensor is on the GPU, hence + // copy it to the CPU before copying it out. + Tensor tensor; + TF_RETURN_IF_ERROR(h->CopyToDevice(*ctx, ctx->HostCPU(), &tensor)); + auto* local_handle = op->add_handles()->mutable_local_handle(); + local_handle->set_device(h->op_device() ? 
h->op_device()->name() + : ctx->HostCPU()->name()); + tensor.AsProtoTensorContent(local_handle->mutable_tensor()); + } else if (h->Type() == TensorHandle::REMOTE) { + // Only serialize the resource dtype and shape of the first handle, since + // all handles are of the same resource dtype and shape. + Device* src_device = absl::get(h->device()); + const bool serialize_resource_dtype_and_shape = + (i == 0) && (h->dtype == DT_RESOURCE) && + (ctx->OnSameTask(src_device, target_device)); + TF_RETURN_IF_ERROR(ctx->RemoteMgr()->SerializeRemoteTensorHandle( + h, op->add_handles()->mutable_remote_handle(), src_device, + absl::get(h->DeviceOrHostCPU(*ctx))->name(), + serialize_resource_dtype_and_shape)); + } else { + return errors::InvalidArgument("Nested packed handles are not supported"); + } + } + return Status::OK(); +} + +void RemoteCopyNode::StartSendPackedHandle(StatusCallback done) { + Status s; + const uint64 context_view_id = ctx_->GetContextViewId(); + if (!send_device_->IsLocal()) { + s = errors::InvalidArgument( + "Copy a packed handle from a remote device is not supported"); + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + + EnqueueRequest request; + uint64 context_id = ctx_->GetContextId(); + request.set_context_id(context_id); + s = SerializePackedHandle(recv_op_id_, src_, recv_device_, ctx_, + request.add_queue()->mutable_send_packed_handle()); + if (!s.ok()) { + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + + TensorShape shape; + s = src_->Shape(&shape); + if (!s.ok()) { + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + captured_state_->SetSrcShape(shape); + + core::RefCountPtr eager_client; + s = ctx_->GetClient(recv_device_, &eager_client); + if (!s.ok()) { + captured_state_->dst()->PoisonRemote(s, recv_device_, context_view_id); + done(s); + return; + } + + EnqueueResponse* response = new EnqueueResponse; + Device* recv_device = recv_device_; + const std::shared_ptr& captured_state = captured_state_; + eager_client->StreamingEnqueueAsync( + &request, response, + [captured_state, response, recv_device, context_view_id, + done](const Status& s) { + if (s.ok()) { + Status status = captured_state->dst()->SetRemoteShape( + captured_state->GetSrcShape(), recv_device, context_view_id); + if (!status.ok()) { + LOG(ERROR) << "Ignoring an error encountered when setting remote " + "shape of tensor received by SendPackedHadnle rpc: " + << status.ToString(); + } + } else { + captured_state->dst()->PoisonRemote(s, recv_device, context_view_id); + } + done(s); + delete response; + }); +} + void RemoteCopyNode::StartRemoteSendTensor(StatusCallback done) { Status s; EnqueueRequest request; @@ -351,7 +449,11 @@ Status RemoteCopyNode::Prepare() { void RemoteCopyNode::RunAsync(StatusCallback done) { started_ = true; - if (ctx_->UseSendTensorRPC() && send_device_->IsLocal() && + if (src_->Type() == TensorHandle::PACKED) { + return StartSendPackedHandle(std::move(done)); + } + + if ((ctx_->UseSendTensorRPC()) && send_device_->IsLocal() && !recv_device_->IsLocal()) { return StartRemoteSendTensor(std::move(done)); } diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h index a527cd47127..7816a24ed33 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.h +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.h @@ -121,6 +121,9 @@ class 
RemoteCopyNode : public AsyncEagerNode { // SendTensor RPC *on the receiver*. void StartRemoteSendTensor(StatusCallback done); + // Send a local packed TensorHandle to a remote device. + void StartSendPackedHandle(StatusCallback done); + // State that is captured by Send and/or Recv callbacks (depending on which // one(s) is remote) and outlives this node in the case of remote->remote // copy. diff --git a/tensorflow/core/protobuf/eager_service.proto b/tensorflow/core/protobuf/eager_service.proto index e9e21777d3f..3fe2bd486ba 100644 --- a/tensorflow/core/protobuf/eager_service.proto +++ b/tensorflow/core/protobuf/eager_service.proto @@ -69,6 +69,7 @@ message QueueItem { // enqueued in streaming call. Request with this item type waits for pending // nodes to finish on the remote executor and report status. SyncRemoteExecutorForStream sync_remote_executor_for_stream = 6; + SendPackedHandleOp send_packed_handle = 7; } } @@ -238,6 +239,27 @@ message SendTensorOp { string device_name = 3; } +// Send a packed TensorHandle to a remote worker. +message SendPackedHandleOp { + // Op id of the remote packed TensorHandle. + int64 op_id = 1; + + message LocalTensorHandle { + TensorProto tensor = 1; + // Device where the tensor is produced. + string device = 2; + } + + message Handle { + oneof item { + LocalTensorHandle local_handle = 1; + RemoteTensorHandle remote_handle = 2; + } + } + + repeated Handle handles = 2; +} + //////////////////////////////////////////////////////////////////////////////// // // Eager Service defines a TensorFlow service that executes operations eagerly From 2560d6fd31b20e81a5a98a73f325fb1dcf0c68a7 Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Wed, 13 May 2020 14:39:12 -0700 Subject: [PATCH 135/412] Fix issue where metric instances created in subclassed layer are not tracked on creation but only on call. PiperOrigin-RevId: 311407078 Change-Id: I2cd7ecb675699a56a4b90e5a29ba80ce6ca59cac --- tensorflow/python/keras/engine/base_layer.py | 6 +++ .../python/keras/engine/base_layer_v1.py | 6 +++ tensorflow/python/keras/engine/training.py | 7 ---- .../python/keras/engine/training_test.py | 37 +++++++++++++++++++ 4 files changed, 49 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index f6fa17df5c2..94b696d842b 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -2585,6 +2585,12 @@ class Layer(module.Module, version_utils.LayerVersionSelector): except AttributeError: pass + # Keep track of metric instance created in subclassed layer. + from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top + for val in nest.flatten(value): + if isinstance(val, metrics_module.Metric): + self._metrics.append(val) + # TODO(scottzhu): Need to track Module object as well for weight tracking. # Be careful about metric if it becomes a Module in future. # Append value to self._layers if relevant diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py index 24d12ae4d59..4a277ec3a3e 100644 --- a/tensorflow/python/keras/engine/base_layer_v1.py +++ b/tensorflow/python/keras/engine/base_layer_v1.py @@ -2223,6 +2223,12 @@ class Layer(base_layer.Layer): except AttributeError: pass + # Keep track of metric instance created in subclassed layer. 
+ from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top + for val in nest.flatten(value): + if isinstance(val, metrics_module.Metric): + self._metrics.append(val) + # TODO(scottzhu): Need to track Module object as well for weight tracking. # Be careful about metric if it becomes a Module in future. # Append value to self._layers if relevant diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 52bf42a099d..d8c95b2a972 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -334,13 +334,6 @@ class Model(base_layer.Layer, version_utils.ModelVersionSelector): super(Model, self).__setattr__(name, value) - # Keep track of metric instance created in subclassed model/layer. - # We do this so that we can maintain the correct order of metrics by adding - # the instance to the `metrics` list as soon as it is created. - from tensorflow.python.keras import metrics as metrics_module # pylint: disable=g-import-not-at-top - if isinstance(value, metrics_module.Metric): - self._metrics.append(value) - @generic_utils.default def build(self, input_shape): """Builds the model based on input shapes received. diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index e4c1ff6b1f8..c1c498b207b 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -2979,6 +2979,8 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): return self.dense1(x) model = TestModel() + self.assertListEqual([m.name for m in model.metrics], + ['metric_1', 'metric_2']) model.compile( loss='mse', optimizer=RMSPropOptimizer(0.01), @@ -2998,6 +3000,41 @@ class TestTrainingWithMetrics(keras_parameterized.TestCase): model.train_on_batch(x, y) model.test_on_batch(x, y) + @keras_parameterized.run_all_keras_modes + def test_multiple_add_metric_calls_layer(self): + + class TestLayer(layers_module.Layer): + + def __init__(self): + super(TestLayer, self).__init__(name='test_layer') + self.dense1 = layers_module.Dense(2, kernel_initializer='ones') + self.m1 = metrics_module.Mean(name='m_1') + self.m2 = [ + metrics_module.Mean(name='m_2'), + metrics_module.Mean(name='m_3') + ] + self.m3 = { + 'mean4': metrics_module.Mean(name='m_4'), + 'mean5': metrics_module.Mean(name='m_5') + } + + def call(self, x): + self.add_metric(self.m2[0](x)) + self.add_metric(self.m2[1](x)) + self.add_metric(self.m1(x)) + self.add_metric(self.m3['mean4'](x)) + self.add_metric(self.m3['mean5'](x)) + self.add_metric(math_ops.reduce_sum(x), name='m_6', aggregation='mean') + return self.dense1(x) + + layer = TestLayer() + self.assertListEqual([m.name for m in layer.metrics], + ['m_1', 'm_2', 'm_3', 'm_4', 'm_5']) + + layer(np.ones((10, 10))) + self.assertListEqual([m.name for m in layer.metrics], + ['m_1', 'm_2', 'm_3', 'm_4', 'm_5', 'm_6']) + @keras_parameterized.run_all_keras_modes def test_duplicate_metric_name_in_add_metric(self): From 8a25406f8a6260dee347512f1cb2d44634cc4977 Mon Sep 17 00:00:00 2001 From: Srinivasan Narayanamoorthy Date: Wed, 13 May 2020 14:58:02 -0700 Subject: [PATCH 136/412] review changes. 
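The Keras change above moves metric tracking from Model.__setattr__ into Layer.__setattr__, so Metric instances assigned in a subclassed layer's __init__ (including ones nested in lists or dicts, via nest.flatten) appear in layer.metrics before the layer is ever called. A small sketch of the user-visible effect; the layer and metric names below are illustrative, not from the patch.

```python
# Illustrative sketch, assuming the base_layer change above: metric attributes
# are tracked at assignment time, not only after the first call.
import numpy as np
import tensorflow as tf
from tensorflow import keras


class MyLayer(keras.layers.Layer):

  def __init__(self):
    super(MyLayer, self).__init__()
    self.mean_metric = keras.metrics.Mean(name='my_mean')  # tracked on assignment
    self.more = [keras.metrics.Mean(name='m_a'), keras.metrics.Mean(name='m_b')]

  def call(self, x):
    self.add_metric(self.mean_metric(x))
    return x


layer = MyLayer()
# Expected to list all three metrics even before the layer is called.
print([m.name for m in layer.metrics])  # ['my_mean', 'm_a', 'm_b']
layer(np.ones((2, 2)))
```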
--- third_party/mkl_dnn/mkldnn_v1.BUILD | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD index c7aa0207ee2..93499fd62f2 100644 --- a/third_party/mkl_dnn/mkldnn_v1.BUILD +++ b/third_party/mkl_dnn/mkldnn_v1.BUILD @@ -70,8 +70,8 @@ cc_library( "src/cpu/**/*.cpp", "src/cpu/**/*.hpp", "src/cpu/xbyak/*.h", - ]) + [":dnnl_config_h"] - + [":dnnl_version_h"], + ]) + [":dnnl_config_h", + ":dnnl_version_h"], hdrs = glob(["include/*"]), copts = [ "-fexceptions", From 2046f7c450b8215f33b4ebfca094b637e36e6a7f Mon Sep 17 00:00:00 2001 From: Bruce Fontaine Date: Wed, 13 May 2020 14:56:38 -0700 Subject: [PATCH 137/412] Move TPUClusterResolver into tpu subdirectory. PiperOrigin-RevId: 311410592 Change-Id: I7c4ca01621ae27cd4c36ff996cf90237328d75e4 --- tensorflow/opensource_only.files | 2 - .../python/distribute/cluster_resolver/BUILD | 30 +- .../distribute/cluster_resolver/tpu/BUILD | 44 +++ .../tpu/tpu_cluster_resolver.py | 349 ++++++++++++++++++ .../{ => tpu}/tpu_cluster_resolver_test.py | 4 +- .../cluster_resolver/tpu_cluster_resolver.py | 334 +---------------- ...ter_resolver.-t-p-u-cluster-resolver.pbtxt | 2 +- ...ter_resolver.-t-p-u-cluster-resolver.pbtxt | 2 +- 8 files changed, 403 insertions(+), 364 deletions(-) create mode 100644 tensorflow/python/distribute/cluster_resolver/tpu/BUILD create mode 100644 tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py rename tensorflow/python/distribute/cluster_resolver/{ => tpu}/tpu_cluster_resolver_test.py (99%) diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 9ca7bb4fe28..41750ea02b4 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -6,8 +6,6 @@ tensorflow/compat_template_v1.__init__.py tensorflow/compiler/mlir/glob_lit_test.bzl tensorflow/lite/micro/build_def.bzl tensorflow/python/autograph/core/config.py -tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py -tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py tensorflow/python/eager/benchmarks_test_base.py tensorflow/python/tpu/profiler/pip_package/BUILD tensorflow/python/tpu/profiler/pip_package/README diff --git a/tensorflow/python/distribute/cluster_resolver/BUILD b/tensorflow/python/distribute/cluster_resolver/BUILD index 8577f1978b9..c7427af2081 100644 --- a/tensorflow/python/distribute/cluster_resolver/BUILD +++ b/tensorflow/python/distribute/cluster_resolver/BUILD @@ -1,10 +1,6 @@ # Description: Operations defined for Cluster Resolvers load("//tensorflow:tensorflow.bzl", "tf_py_test") -load( - "//tensorflow/core/platform:build_config.bzl", - "tf_additional_rpc_deps", -) package( default_visibility = [ @@ -64,12 +60,7 @@ py_library( name = "tpu_cluster_resolver_py", srcs = ["tpu_cluster_resolver.py"], srcs_version = "PY2AND3", - deps = [ - ":base_cluster_resolver_py", - "//tensorflow/python:training_server_lib", - "//tensorflow/python/tpu:tpu_lib", - "//tensorflow/python/tpu/client", - ] + tf_additional_rpc_deps(), + deps = ["//tensorflow/python/distribute/cluster_resolver/tpu:tpu_cluster_resolver_py"], ) py_library( @@ -137,25 +128,6 @@ tf_py_test( ], ) -tf_py_test( - name = "tpu_cluster_resolver_py_test", - size = "small", - srcs = ["tpu_cluster_resolver_test.py"], - grpc_enabled = True, - main = "tpu_cluster_resolver_test.py", - python_version = "PY3", - deps = [ - ":tpu_cluster_resolver_py", - "//tensorflow/python:client_testlib", - 
"//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:platform_test", - "//tensorflow/python:training_server_lib", - "//tensorflow/python/tpu/client", - "@absl_py//absl/testing:flagsaver", - ], -) - tf_py_test( name = "slurm_cluster_resolver_py_test", size = "small", diff --git a/tensorflow/python/distribute/cluster_resolver/tpu/BUILD b/tensorflow/python/distribute/cluster_resolver/tpu/BUILD new file mode 100644 index 00000000000..4825bf3b6d8 --- /dev/null +++ b/tensorflow/python/distribute/cluster_resolver/tpu/BUILD @@ -0,0 +1,44 @@ +# Description: OSS only cluster resolvers + +load("//tensorflow:tensorflow.bzl", "tf_py_test") +load( + "//tensorflow/core/platform:build_config.bzl", + "tf_additional_rpc_deps", +) + +package( + default_visibility = [ + "//tensorflow:internal", + ], + licenses = ["notice"], # Apache 2.0 +) + +py_library( + name = "tpu_cluster_resolver_py", + srcs = ["tpu_cluster_resolver.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:training_server_lib", + "//tensorflow/python/distribute/cluster_resolver:base_cluster_resolver_py", + "//tensorflow/python/tpu:tpu_lib", + "//tensorflow/python/tpu/client", + ] + tf_additional_rpc_deps(), +) + +tf_py_test( + name = "tpu_cluster_resolver_py_test", + size = "small", + srcs = ["tpu_cluster_resolver_test.py"], + grpc_enabled = True, + main = "tpu_cluster_resolver_test.py", + python_version = "PY3", + deps = [ + ":tpu_cluster_resolver_py", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:platform_test", + "//tensorflow/python:training_server_lib", + "//tensorflow/python/tpu/client", + ], +) diff --git a/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py new file mode 100644 index 00000000000..943b736fde4 --- /dev/null +++ b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver.py @@ -0,0 +1,349 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Implementation of Cluster Resolvers for Cloud TPUs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re + +from tensorflow.python.distribute.cluster_resolver import cluster_resolver +from tensorflow.python.framework import errors +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib +from tensorflow.python.training import server_lib +from tensorflow.python.util import compat + +try: + from cloud_tpu_client import client # pylint: disable=g-import-not-at-top +except ImportError: + logging.debug( + 'Falling back to TensorFlow client; we recommended you install the Cloud ' + 'TPU client directly with pip install cloud-tpu-client.') + from tensorflow.python.tpu.client import client # pylint: disable=g-import-not-at-top + + +def is_running_in_gce(): + return True + + +_TPU_DEVICE_REGEX = re.compile( + r'.*task:(?P\d+)/.*device:TPU:(?P\d+)$') +_TPU_CONN_RETRIES = 120 +DeviceDetails = collections.namedtuple( + 'DeviceDetails', ['device_map', 'total_cores']) + + +class TPUClusterResolver(cluster_resolver.ClusterResolver): + """Cluster Resolver for Google Cloud TPUs. + + This is an implementation of cluster resolvers for the Google Cloud TPU + service. As Cloud TPUs are in alpha, you will need to specify a API definition + file for this to consume, in addition to a list of Cloud TPUs in your Google + Cloud Platform project. + + TPUClusterResolver supports the following distinct environments: + Google Compute Engine + Google Kubernetes Engine + Google internal + """ + + @staticmethod + def _get_device_dict_and_cores(devices): + """Returns a dict of hosts to cores and total cores given devices names. + + Returns a namedtuple with two attributes: + device_map: A map of host_ids to a list of core_ids. + total_cores: The total number of cores within the TPU system. + + Args: + devices: A list of devices returned by session.list_devices() + """ + device_map = collections.defaultdict(list) + num_cores = 0 + for device in devices: + match = _TPU_DEVICE_REGEX.match(device.name) + if match: + host_id = match.group('host_id') + core_id = match.group('core_id') + device_map[host_id].append(core_id) + num_cores += 1 + return DeviceDetails(device_map, num_cores) + + @staticmethod + def _verify_and_return_same_core_count(device_dict): + """Verifies that every device in device_dict has the same # of cores.""" + num_cores_per_host_set = ( + {len(core_ids) for core_ids in device_dict.values()}) + if len(num_cores_per_host_set) != 1: + raise RuntimeError('TPU cores on each device is not the same. This ' + 'should never happen. Devices: {}'.format(device_dict)) + return num_cores_per_host_set.pop() + + def __init__(self, + tpu=None, + zone=None, + project=None, + job_name='worker', + coordinator_name=None, + coordinator_address=None, + credentials='default', + service=None, + discovery_url=None): + """Creates a new TPUClusterResolver object. + + The ClusterResolver will then use the parameters to query the Cloud TPU APIs + for the IP addresses and ports of each Cloud TPU listed. + + Args: + tpu: A string corresponding to the TPU to use. 
If the string is an empty + string, the string 'local', or a string that begins with 'grpc://', then + it is assumed to not correspond with a Cloud TPU and will instead be + passed as the session master and no ClusterSpec propagation will be + done. In the future, this may also support a list of strings when + multiple Cloud TPUs are used. + zone: Zone where the TPUs are located. If omitted or empty, we will assume + that the zone of the TPU is the same as the zone of the GCE VM, which we + will try to discover from the GCE metadata service. + project: Name of the GCP project containing Cloud TPUs. If omitted or + empty, we will try to discover the project name of the GCE VM from the + GCE metadata service. + job_name: Name of the TensorFlow job the TPUs belong to. + coordinator_name: The name to use for the coordinator. Set to None if the + coordinator should not be included in the computed ClusterSpec. + coordinator_address: The address of the coordinator (typically an ip:port + pair). If set to None, a TF server will be started. If coordinator_name + is None, a TF server will not be started even if coordinator_address is + None. + credentials: GCE Credentials. If None, then we use default credentials + from the oauth2client + service: The GCE API object returned by the googleapiclient.discovery + function. If you specify a custom service object, then the credentials + parameter will be ignored. + discovery_url: A URL template that points to the location of the discovery + service. It should have two parameters {api} and {apiVersion} that when + filled in produce an absolute URL to the discovery document for that + service. The environment variable 'TPU_API_DISCOVERY_URL' will override + this. + + Raises: + ImportError: If the googleapiclient is not installed. + ValueError: If no TPUs are specified. + RuntimeError: If an empty TPU name is specified and this is running in a + Google Cloud environment. + """ + + self._cloud_tpu_client = client.Client( + tpu=tpu, + zone=zone, + project=project, + credentials=credentials, + service=service, + discovery_url=discovery_url) + + self._tpu = self._cloud_tpu_client.name() + # By default the task_type is 'worker` and the task_id is 0 (which is the + # first worker in the task). + self.task_type = job_name + self.task_id = 0 + self._coordinator_name = coordinator_name + if (coordinator_name and not coordinator_address): + self._start_local_server() + else: + self._coordinator_address = coordinator_address + + def __enter__(self): + self._cloud_tpu_client.enter() + + def __exit__(self, type, value, traceback): # pylint: disable=redefined-builtin + self._cloud_tpu_client.exit(type, value, traceback) + + def master(self, task_type=None, task_id=None, rpc_layer=None): + """Get the Master string to be used for the session. + + In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of + first instance in the ClusterSpec returned by the cluster_spec function. + + If a non-TPU name is used when constructing a TPUClusterResolver, that will + be returned instead (e.g. If the tpus argument's value when constructing + this TPUClusterResolver was 'grpc://10.240.1.2:8470', + 'grpc://10.240.1.2:8470' will be returned). + + Args: + task_type: (Optional, string) The type of the TensorFlow task of the + master. + task_id: (Optional, integer) The index of the TensorFlow task of the + master. + rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to + communicate with TPUs. 
+ + Returns: + string, the connection string to use when creating a session. + + Raises: + ValueError: If none of the TPUs specified exists. + """ + + cluster_spec = self.cluster_spec() + if task_type is not None and task_id is not None: + # task_type and task_id is from the function parameter + master = cluster_spec.task_address(task_type, task_id) + elif self.task_type is not None and self.task_id is not None: + # task_type and task_id is from the object + master = cluster_spec.task_address(self.task_type, self.task_id) + else: + # by default we take the first item in the cluster with the right name + job_tasks = cluster_spec.job_tasks(self.task_type) + if not job_tasks: + raise ValueError('No TPUs with the specified names exist.') + master = job_tasks[0] + return cluster_resolver.format_master_url(master, 'grpc') + + def get_master(self): + return self.master() + + def get_job_name(self): + return self.task_type + + def get_tpu_system_metadata(self): + """Returns the metadata of the TPU system. + + Users can call this method to get some facts of the TPU system, like + total number of cores, number of TPU workers and the devices. E.g. + ```python + + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') + tpu_system_medata = resolver.get_tpu_system_metadata() + num_hosts = tpu_system_medata.num_hosts + ``` + + Returns: + A `tf.tpu.experimental.TPUSystemMetadata` object. + """ + cluster_spec = self.cluster_spec() + cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None + tpu_system_metadata = ( + tpu_system_metadata_lib._query_tpu_system_metadata( # pylint: disable=protected-access + self.master(), + cluster_def=cluster_def, + query_topology=False)) + + return tpu_system_metadata + + def cluster_spec(self): + """Returns a ClusterSpec object based on the latest TPU information. + + We retrieve the information from the GCE APIs every time this method is + called. + + Returns: + A ClusterSpec containing host information returned from Cloud TPUs, + or None. + + Raises: + RuntimeError: If the provided TPU is not healthy. + """ + ############################################################################ + # There are 5 potential cases this code must handle: + # 1. [Normal case.] We should resolve the TPU name to a set of tasks, and + # a. Create a ClusterSpec that includes the coordinator job + # b. Create a ClusterSpec without the coordinator job. + # 2. [GKE / No API Access.] We should not resolve the TPU name to a set of + # tasks and + # a. Create a ClusterSpec with the coordinator + # b. Create a ClusterSpec without the coordinator + ############################################################################ + + network_endpoints = self._cloud_tpu_client.network_endpoints() + worker_list = [ + '%s:%s' % (endpoint['ipAddress'], endpoint['port']) + for endpoint in network_endpoints + ] + cluster_spec = {self.task_type: worker_list} + if self._coordinator_address: + # {1, 2}.a + cluster_spec[self._coordinator_name] = [self._coordinator_address] + + return server_lib.ClusterSpec(cluster_spec) + + def num_accelerators(self, + task_type=None, + task_id=None, + config_proto=None): + """Returns the number of TPU cores per worker. + + Connects to the master and list all the devices present in the master, + and counts them up. Also verifies that the device counts per host in the + cluster is the same before returning the number of TPU cores per host. + + Args: + task_type: Unused. + task_id: Unused. 
+ config_proto: Used to create a connection to a TPU master in order to + retrieve the system metadata. + + Raises: + RuntimeError: If we cannot talk to a TPU worker after retrying or if the + number of TPU devices per host is different. + """ + retry_count = 1 + # TODO(b/120564445): Replace with standard library for retries. + while True: + try: + device_details = TPUClusterResolver._get_device_dict_and_cores( + cluster_resolver.get_accelerator_devices( + self.master(), config_proto=config_proto)) + break + except errors.DeadlineExceededError: + error_message = ('Failed to connect to master. The TPU might not be ' + 'ready (e.g. still scheduling) or the master ' + 'address is incorrect: got (%s)' % self.master()) + if retry_count <= _TPU_CONN_RETRIES: + logging.warning(error_message) + logging.warning('Retrying (%d/%d)...', retry_count, _TPU_CONN_RETRIES) + retry_count += 1 + else: + raise RuntimeError(error_message) + + if device_details.total_cores: + return {'TPU': TPUClusterResolver._verify_and_return_same_core_count( + device_details.device_map)} + return {'TPU': 0} + + @property + def environment(self): + """Returns the current environment which TensorFlow is running in.""" + return self._environment + + def _start_local_server(self): + address = compat.as_text(self._cloud_tpu_client.get_local_ip()) + self._server = server_lib.Server({'local': ['0.0.0.0:0']}, + protocol='grpc', + config=None, + start=True) + # self._server.target is of the form: grpc://ipaddress:port + target = compat.as_bytes(self._server.target) + splits = target.split(compat.as_bytes(':')) + assert len(splits) == 3, self._server.target + assert splits[0] == compat.as_bytes('grpc'), self._server.target + self._coordinator_port = compat.as_text(splits[2]) + self._coordinator_address = '%s:%s' % ( + address, compat.as_text(self._coordinator_port)) + + def __deepcopy__(self, memo): + # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy. 
+ return self diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver_test.py similarity index 99% rename from tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py rename to tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver_test.py index 1fad0a3fc95..1dc9a73fd74 100644 --- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py +++ b/tensorflow/python/distribute/cluster_resolver/tpu/tpu_cluster_resolver_test.py @@ -25,7 +25,7 @@ from six.moves.urllib.error import URLError from tensorflow.python import framework from tensorflow.python.client import session -from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver as resolver +from tensorflow.python.distribute.cluster_resolver.tpu import tpu_cluster_resolver as resolver from tensorflow.python.eager.context import LogicalDevice from tensorflow.python.framework import errors from tensorflow.python.framework import test_util @@ -41,7 +41,7 @@ except ImportError: logging.debug( 'Falling back to TensorFlow client; we recommended you install the Cloud ' 'TPU client directly with pip install cloud-tpu-client.') - from tensorflow.python.tpu.client import client + from tensorflow.python.tpu.client import client # pylint: disable=g-import-not-at-top class MockRequestClass(object): diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py index 79ec0bc13d1..5731c2c930a 100644 --- a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py +++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py @@ -12,339 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Implementation of Cluster Resolvers for Cloud TPUs.""" +"""Shim so that direct imports of tpu_cluster_resolver get correct symbols. 
+""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import collections -import re - -from tensorflow.python.distribute.cluster_resolver import cluster_resolver -from tensorflow.python.framework import errors -from tensorflow.python.platform import tf_logging as logging -from tensorflow.python.tpu import tpu_system_metadata as tpu_system_metadata_lib -from tensorflow.python.training import server_lib -from tensorflow.python.util import compat +from tensorflow.python.distribute.cluster_resolver.tpu.tpu_cluster_resolver import is_running_in_gce # pylint: disable=unused-import +from tensorflow.python.distribute.cluster_resolver.tpu.tpu_cluster_resolver import TPUClusterResolver from tensorflow.python.util.tf_export import tf_export -try: - from cloud_tpu_client import client # pylint: disable=g-import-not-at-top -except ImportError: - logging.debug( - 'Falling back to TensorFlow client; we recommended you install the Cloud ' - 'TPU client directly with pip install cloud-tpu-client.') - from tensorflow.python.tpu.client import client - -def is_running_in_gce(): - return True - - -_TPU_DEVICE_REGEX = re.compile( - r'.*task:(?P\d+)/.*device:TPU:(?P\d+)$') -_TPU_CONN_RETRIES = 120 -DeviceDetails = collections.namedtuple( - 'DeviceDetails', ['device_map', 'total_cores']) - - -@tf_export('distribute.cluster_resolver.TPUClusterResolver') -class TPUClusterResolver(cluster_resolver.ClusterResolver): - """Cluster Resolver for Google Cloud TPUs. - - This is an implementation of cluster resolvers for the Google Cloud TPU - service. As Cloud TPUs are in alpha, you will need to specify a API definition - file for this to consume, in addition to a list of Cloud TPUs in your Google - Cloud Platform project. - - TPUClusterResolver supports the following distinct environments: - Google Compute Engine - Google Kubernetes Engine - Google internal - """ - - @staticmethod - def _get_device_dict_and_cores(devices): - """Returns a dict of hosts to cores and total cores given devices names. - - Returns a namedtuple with two attributes: - device_map: A map of host_ids to a list of core_ids. - total_cores: The total number of cores within the TPU system. - - Args: - devices: A list of devices returned by session.list_devices() - """ - device_map = collections.defaultdict(list) - num_cores = 0 - for device in devices: - match = _TPU_DEVICE_REGEX.match(device.name) - if match: - host_id = match.group('host_id') - core_id = match.group('core_id') - device_map[host_id].append(core_id) - num_cores += 1 - return DeviceDetails(device_map, num_cores) - - @staticmethod - def _verify_and_return_same_core_count(device_dict): - """Verifies that every device in device_dict has the same # of cores.""" - num_cores_per_host_set = ( - {len(core_ids) for core_ids in device_dict.values()}) - if len(num_cores_per_host_set) != 1: - raise RuntimeError('TPU cores on each device is not the same. This ' - 'should never happen. Devices: {}'.format(device_dict)) - return num_cores_per_host_set.pop() - - def __init__(self, - tpu=None, - zone=None, - project=None, - job_name='worker', - coordinator_name=None, - coordinator_address=None, - credentials='default', - service=None, - discovery_url=None): - """Creates a new TPUClusterResolver object. - - The ClusterResolver will then use the parameters to query the Cloud TPU APIs - for the IP addresses and ports of each Cloud TPU listed. - - Args: - tpu: A string corresponding to the TPU to use. 
If the string is an empty - string, the string 'local', or a string that begins with 'grpc://', then - it is assumed to not correspond with a Cloud TPU and will instead be - passed as the session master and no ClusterSpec propagation will be - done. In the future, this may also support a list of strings when - multiple Cloud TPUs are used. - zone: Zone where the TPUs are located. If omitted or empty, we will assume - that the zone of the TPU is the same as the zone of the GCE VM, which we - will try to discover from the GCE metadata service. - project: Name of the GCP project containing Cloud TPUs. If omitted or - empty, we will try to discover the project name of the GCE VM from the - GCE metadata service. - job_name: Name of the TensorFlow job the TPUs belong to. - coordinator_name: The name to use for the coordinator. Set to None if the - coordinator should not be included in the computed ClusterSpec. - coordinator_address: The address of the coordinator (typically an ip:port - pair). If set to None, a TF server will be started. If coordinator_name - is None, a TF server will not be started even if coordinator_address is - None. - credentials: GCE Credentials. If None, then we use default credentials - from the oauth2client - service: The GCE API object returned by the googleapiclient.discovery - function. If you specify a custom service object, then the credentials - parameter will be ignored. - discovery_url: A URL template that points to the location of the discovery - service. It should have two parameters {api} and {apiVersion} that when - filled in produce an absolute URL to the discovery document for that - service. The environment variable 'TPU_API_DISCOVERY_URL' will override - this. - - Raises: - ImportError: If the googleapiclient is not installed. - ValueError: If no TPUs are specified. - RuntimeError: If an empty TPU name is specified and this is running in a - Google Cloud environment. - """ - - self._cloud_tpu_client = client.Client( - tpu=tpu, - zone=zone, - project=project, - credentials=credentials, - service=service, - discovery_url=discovery_url) - - self._tpu = self._cloud_tpu_client.name() - # By default the task_type is 'worker` and the task_id is 0 (which is the - # first worker in the task). - self.task_type = job_name - self.task_id = 0 - self._coordinator_name = coordinator_name - if (coordinator_name and not coordinator_address): - self._start_local_server() - else: - self._coordinator_address = coordinator_address - - def __enter__(self): - self._cloud_tpu_client.enter() - - def __exit__(self, type, value, traceback): # pylint: disable=redefined-builtin - self._cloud_tpu_client.exit(type, value, traceback) - - def master(self, task_type=None, task_id=None, rpc_layer=None): - """Get the Master string to be used for the session. - - In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of - first instance in the ClusterSpec returned by the cluster_spec function. - - If a non-TPU name is used when constructing a TPUClusterResolver, that will - be returned instead (e.g. If the tpus argument's value when constructing - this TPUClusterResolver was 'grpc://10.240.1.2:8470', - 'grpc://10.240.1.2:8470' will be returned). - - Args: - task_type: (Optional, string) The type of the TensorFlow task of the - master. - task_id: (Optional, integer) The index of the TensorFlow task of the - master. - rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to - communicate with TPUs. 
- - Returns: - string, the connection string to use when creating a session. - - Raises: - ValueError: If none of the TPUs specified exists. - """ - - cluster_spec = self.cluster_spec() - if task_type is not None and task_id is not None: - # task_type and task_id is from the function parameter - master = cluster_spec.task_address(task_type, task_id) - elif self.task_type is not None and self.task_id is not None: - # task_type and task_id is from the object - master = cluster_spec.task_address(self.task_type, self.task_id) - else: - # by default we take the first item in the cluster with the right name - job_tasks = cluster_spec.job_tasks(self.task_type) - if not job_tasks: - raise ValueError('No TPUs with the specified names exist.') - master = job_tasks[0] - return cluster_resolver.format_master_url(master, 'grpc') - - def get_master(self): - return self.master() - - def get_job_name(self): - return self.task_type - - def get_tpu_system_metadata(self): - """Returns the metadata of the TPU system. - - Users can call this method to get some facts of the TPU system, like - total number of cores, number of TPU workers and the devices. E.g. - ```python - - resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='') - tpu_system_medata = resolver.get_tpu_system_metadata() - num_hosts = tpu_system_medata.num_hosts - ``` - - Returns: - A `tf.tpu.experimental.TPUSystemMetadata` object. - """ - cluster_spec = self.cluster_spec() - cluster_def = cluster_spec.as_cluster_def() if cluster_spec else None - tpu_system_metadata = ( - tpu_system_metadata_lib._query_tpu_system_metadata( # pylint: disable=protected-access - self.master(), - cluster_def=cluster_def, - query_topology=False)) - - return tpu_system_metadata - - def cluster_spec(self): - """Returns a ClusterSpec object based on the latest TPU information. - - We retrieve the information from the GCE APIs every time this method is - called. - - Returns: - A ClusterSpec containing host information returned from Cloud TPUs, - or None. - - Raises: - RuntimeError: If the provided TPU is not healthy. - """ - ############################################################################ - # There are 5 potential cases this code must handle: - # 1. [Normal case.] We should resolve the TPU name to a set of tasks, and - # a. Create a ClusterSpec that includes the coordinator job - # b. Create a ClusterSpec without the coordinator job. - # 2. [GKE / No API Access.] We should not resolve the TPU name to a set of - # tasks and - # a. Create a ClusterSpec with the coordinator - # b. Create a ClusterSpec without the coordinator - ############################################################################ - - network_endpoints = self._cloud_tpu_client.network_endpoints() - worker_list = [ - '%s:%s' % (endpoint['ipAddress'], endpoint['port']) - for endpoint in network_endpoints - ] - cluster_spec = {self.task_type: worker_list} - if self._coordinator_address: - # {1, 2}.a - cluster_spec[self._coordinator_name] = [self._coordinator_address] - - return server_lib.ClusterSpec(cluster_spec) - - def num_accelerators(self, - task_type=None, - task_id=None, - config_proto=None): - """Returns the number of TPU cores per worker. - - Connects to the master and list all the devices present in the master, - and counts them up. Also verifies that the device counts per host in the - cluster is the same before returning the number of TPU cores per host. - - Args: - task_type: Unused. - task_id: Unused. 
- config_proto: Used to create a connection to a TPU master in order to - retrieve the system metadata. - - Raises: - RuntimeError: If we cannot talk to a TPU worker after retrying or if the - number of TPU devices per host is different. - """ - retry_count = 1 - # TODO(b/120564445): Replace with standard library for retries. - while True: - try: - device_details = TPUClusterResolver._get_device_dict_and_cores( - cluster_resolver.get_accelerator_devices( - self.master(), config_proto=config_proto)) - break - except errors.DeadlineExceededError: - error_message = ('Failed to connect to master. The TPU might not be ' - 'ready (e.g. still scheduling) or the master ' - 'address is incorrect: got (%s)' % self.master()) - if retry_count <= _TPU_CONN_RETRIES: - logging.warning(error_message) - logging.warning('Retrying (%d/%d)...', retry_count, _TPU_CONN_RETRIES) - retry_count += 1 - else: - raise RuntimeError(error_message) - - if device_details.total_cores: - return {'TPU': TPUClusterResolver._verify_and_return_same_core_count( - device_details.device_map)} - return {'TPU': 0} - - @property - def environment(self): - """Returns the current environment which TensorFlow is running in.""" - return self._environment - - def _start_local_server(self): - address = compat.as_text(self._cloud_tpu_client.get_local_ip()) - self._server = server_lib.Server({'local': ['0.0.0.0:0']}, - protocol='grpc', - config=None, - start=True) - # self._server.target is of the form: grpc://ipaddress:port - target = compat.as_bytes(self._server.target) - splits = target.split(compat.as_bytes(':')) - assert len(splits) == 3, self._server.target - assert splits[0] == compat.as_bytes('grpc'), self._server.target - self._coordinator_port = compat.as_text(splits[2]) - self._coordinator_address = '%s:%s' % ( - address, compat.as_text(self._coordinator_port)) - - def __deepcopy__(self, memo): - # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy. - return self +tf_export('distribute.cluster_resolver.TPUClusterResolver')(TPUClusterResolver) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt index c0dc0054165..658212aca5e 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt index c0dc0054165..658212aca5e 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.cluster_resolver.-t-p-u-cluster-resolver.pbtxt @@ -1,6 +1,6 @@ path: "tensorflow.distribute.cluster_resolver.TPUClusterResolver" tf_class { - is_instance: "" + is_instance: "" is_instance: "" is_instance: "" member { From 18c0da102443f3500c43618d469bd7e7f761696c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 15:00:16 -0700 Subject: [PATCH 138/412] Correctly handle empty matrices in tf.linalg.svd. 
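
A minimal sketch of the intended behavior, inferred from the svd_op_gpu.cu.cc
and svd_op_impl.h hunks below (the input shape and printed shapes are
illustrative assumptions, not values copied from the diff): with
compute_uv=True and full_matrices=True, an input with one zero dimension now
gets an identity basis filled in for the non-empty side.

    import tensorflow as tf

    a = tf.zeros([3, 0])  # empty matrix: 3 rows, 0 columns
    s, u, v = tf.linalg.svd(a, compute_uv=True, full_matrices=True)
    print(s.shape)  # (0,)    no singular values
    print(u.shape)  # (3, 3)  full canonical basis for the column space
    print(v.shape)  # (0, 0)  empty
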
PiperOrigin-RevId: 311411299 Change-Id: Ie5440ad4593291409f801fb174fbac3120db0eb7 --- tensorflow/core/kernels/BUILD | 4 +++- tensorflow/core/kernels/svd_op_gpu.cu.cc | 19 +++++++++++++-- tensorflow/core/kernels/svd_op_impl.h | 23 +++++++++++++++---- tensorflow/python/kernel_tests/svd_op_test.py | 7 +++--- 4 files changed, 42 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 7cfb6fcae67..6cb8704f494 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3792,7 +3792,9 @@ tf_kernel_library( tf_kernel_library( name = "svd_op", prefix = "svd_op", - deps = LINALG_DEPS, + deps = LINALG_DEPS + if_cuda([ + ":eye_functor", + ]), ) tf_kernel_library( diff --git a/tensorflow/core/kernels/svd_op_gpu.cu.cc b/tensorflow/core/kernels/svd_op_gpu.cu.cc index 2821abf8a6c..482fd057e4e 100644 --- a/tensorflow/core/kernels/svd_op_gpu.cu.cc +++ b/tensorflow/core/kernels/svd_op_gpu.cu.cc @@ -37,6 +37,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/cuda_solvers.h" +#include "tensorflow/core/kernels/eye_functor.h" #include "tensorflow/core/kernels/linalg_ops_common.h" #include "tensorflow/core/kernels/transpose_functor.h" #include "tensorflow/core/lib/core/errors.h" @@ -390,8 +391,22 @@ class SvdOpGpu : public AsyncOpKernel { done); if (n == 0 || m == 0) { - // If X is an empty matrix (0 rows, 0 col), X * X' == X. - // Therefore, we return X. + if (n == m || !compute_uv_ || !full_matrices_) { + // S, U, and V are all empty. Nothing to do. + done(); + return; + } + auto device = context->eigen_device(); + functor::EyeFunctor eye; + if (m > 0) { + // Return a full canonical basis for the column space. + auto outputU_reshaped = outputU->flat_inner_dims(); + eye(device, outputU_reshaped); + } else if (n > 0) { + // Return a full canonical basis for the row space. + auto outputV_reshaped = outputV->flat_inner_dims(); + eye(device, outputV_reshaped); + } done(); return; } diff --git a/tensorflow/core/kernels/svd_op_impl.h b/tensorflow/core/kernels/svd_op_impl.h index 2a67700c126..675826a057c 100644 --- a/tensorflow/core/kernels/svd_op_impl.h +++ b/tensorflow/core/kernels/svd_op_impl.h @@ -83,16 +83,29 @@ class SvdOp : public LinearAlgebraOp { void ComputeMatrix(OpKernelContext* context, const ConstMatrixMaps& inputs, MatrixMaps* outputs) final { + int64 n = inputs[0].cols(); + int64 m = inputs[0].rows(); + const bool empty = (m == 0 || n == 0); int options = 0; // Don't compute singular vectors; if (compute_uv_) { options = full_matrices_ ? Eigen::ComputeFullU | Eigen::ComputeFullV : Eigen::ComputeThinU | Eigen::ComputeThinV; } - Eigen::BDCSVD svd(inputs[0], options); - outputs->at(0) = svd.singularValues().template cast(); - if (compute_uv_) { - outputs->at(1) = svd.matrixU(); - outputs->at(2) = svd.matrixV(); + if (!empty) { + Eigen::BDCSVD svd(inputs[0], options); + outputs->at(0) = svd.singularValues().template cast(); + if (compute_uv_) { + outputs->at(1) = svd.matrixU(); + outputs->at(2) = svd.matrixV(); + } + } else if (compute_uv_ && full_matrices_) { + // For an empty matrix where only one dimension is zero, we still set + // U or V to the unit matrix for the dimension that is non-zero. 
+ if (m > 0) { + outputs->at(1) = Matrix::Identity(m, m); + } else { + outputs->at(2) = Matrix::Identity(n, n); + } } } diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index 120e604e7ae..a53d2470aa5 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -93,7 +93,8 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, full_matrices_): def CompareSingularValues(self, x, y, tol): - self.assertAllClose(x, y, atol=(x[0] + y[0]) * tol) + atol = (x[0] + y[0]) * tol if len(x) else tol + self.assertAllClose(x, y, atol=atol) def CompareSingularVectors(self, x, y, rank, tol): # We only compare the first 'rank' singular vectors since the @@ -374,8 +375,8 @@ if __name__ == "__main__": for compute_uv in False, True: for full_matrices in False, True: for dtype in dtypes_to_test: - for rows in 1, 2, 5, 10, 32, 100: - for cols in 1, 2, 5, 10, 32, 100: + for rows in 0, 1, 2, 5, 10, 32, 100: + for cols in 0, 1, 2, 5, 10, 32, 100: for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): shape = batch_dims + (rows, cols) # TF2 does not support placeholders under eager so we skip it From 4eeb6d742e1ff416f9fc2baeba2fab698e6f28cf Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Wed, 13 May 2020 15:00:41 -0700 Subject: [PATCH 139/412] More robustly check for undefined symbols before attempting to use them. This check is required because undefined symbols are initialized with a special placeholder before entering control flow. This placeholder can lead to confusing error messages if left unchecked. The change introduces two more general operators: "variable load" and "return". PiperOrigin-RevId: 311411422 Change-Id: Ic8abda74c1f68c1d4de491949d309d60099b91b4 --- tensorflow/python/autograph/converters/BUILD | 14 ++ .../autograph/converters/asserts_test.py | 4 +- .../python/autograph/converters/functions.py | 9 - .../autograph/converters/functions_test.py | 4 +- .../autograph/converters/return_statements.py | 99 ++++---- .../converters/return_statements_test.py | 3 +- .../python/autograph/converters/variables.py | 76 ++++++ .../autograph/converters/variables_test.py | 116 +++++++++ tensorflow/python/autograph/core/BUILD | 1 + .../autograph/core/function_wrappers.py | 11 +- .../autograph/core/function_wrappers_test.py | 2 +- .../python/autograph/impl/conversion.py | 2 + tensorflow/python/autograph/operators/BUILD | 18 +- .../python/autograph/operators/__init__.py | 8 +- .../autograph/operators/control_flow.py | 26 +- .../operators/control_flow_deprecated_py2.py | 10 +- .../autograph/operators/control_flow_test.py | 10 +- .../python/autograph/operators/symbols.py | 115 --------- .../autograph/operators/symbols_test.py | 230 ------------------ .../{special_values.py => variables.py} | 40 +-- ...ecial_values_test.py => variables_test.py} | 32 ++- .../reaching_definitions_py3_test.py | 12 + .../reaching_definitions_test.py | 3 + 23 files changed, 351 insertions(+), 494 deletions(-) create mode 100644 tensorflow/python/autograph/converters/variables.py create mode 100644 tensorflow/python/autograph/converters/variables_test.py delete mode 100644 tensorflow/python/autograph/operators/symbols.py delete mode 100644 tensorflow/python/autograph/operators/symbols_test.py rename tensorflow/python/autograph/operators/{special_values.py => variables.py} (72%) rename tensorflow/python/autograph/operators/{special_values_test.py => variables_test.py} (58%) diff --git 
a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD index 9c1d5a38707..ec780a7c0a1 100644 --- a/tensorflow/python/autograph/converters/BUILD +++ b/tensorflow/python/autograph/converters/BUILD @@ -33,6 +33,7 @@ py_library( "logical_expressions.py", "return_statements.py", "slices.py", + "variables.py", ], srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], @@ -213,3 +214,16 @@ py_test( "//tensorflow/python/autograph/pyct", ], ) + +py_test( + name = "variables_test", + srcs = ["variables_test.py"], + python_version = "PY3", + srcs_version = "PY2AND3", + deps = [ + ":converters", + "//tensorflow/python:client_testlib", + "//tensorflow/python/autograph/core:test_lib", + "//tensorflow/python/autograph/pyct", + ], +) diff --git a/tensorflow/python/autograph/converters/asserts_test.py b/tensorflow/python/autograph/converters/asserts_test.py index fd31cd15a0e..dc435cbc90e 100644 --- a/tensorflow/python/autograph/converters/asserts_test.py +++ b/tensorflow/python/autograph/converters/asserts_test.py @@ -20,6 +20,7 @@ from __future__ import print_function from tensorflow.python.autograph.converters import asserts from tensorflow.python.autograph.converters import functions +from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.core import converter_testing from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors_impl @@ -36,7 +37,8 @@ class AssertsTest(converter_testing.TestCase): return a with ops.Graph().as_default(): - with self.converted(test_fn, (functions, asserts), {}) as result: + with self.converted( + test_fn, (functions, asserts, return_statements), {}) as result: op = result.test_fn(constant_op.constant(False)) with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, 'testmsg'): diff --git a/tensorflow/python/autograph/converters/functions.py b/tensorflow/python/autograph/converters/functions.py index fc33dafb63d..26ead131f9b 100644 --- a/tensorflow/python/autograph/converters/functions.py +++ b/tensorflow/python/autograph/converters/functions.py @@ -38,15 +38,6 @@ class _Function(object): class FunctionTransformer(converter.Base): """Wraps function bodies around autograph-specific boilerplate.""" - def visit_Return(self, node): - if node.value is None: - return node - node = self.generic_visit(node) - return templates.replace( - 'return function_context_name.mark_return_value(value)', - function_context_name=self.state[_Function].context_name, - value=node.value) - def _function_scope_options(self, fn_scope): """Returns the options with which to create function scopes.""" # Top-level function receive the options that were directly requested. 
diff --git a/tensorflow/python/autograph/converters/functions_test.py b/tensorflow/python/autograph/converters/functions_test.py index aad455e67d7..2a51ef71ebf 100644 --- a/tensorflow/python/autograph/converters/functions_test.py +++ b/tensorflow/python/autograph/converters/functions_test.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.autograph.converters import functions +from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.core import ag_ctx from tensorflow.python.autograph.core import converter from tensorflow.python.autograph.core import converter_testing @@ -74,7 +75,7 @@ class FunctionTransformer(converter_testing.TestCase): l += 1 return l, inner_fn(l) - with self.converted(test_fn, functions, {}, + with self.converted(test_fn, (functions, return_statements), {}, (ops.name_scope,)) as result: first, second = result.test_fn(constant_op.constant(1)) self.assertIn('test_fn/', first.op.name) @@ -119,6 +120,7 @@ class FunctionTransformer(converter_testing.TestCase): ns = {'TestClass': TestClass} node, ctx = self.prepare(TestClass, ns) node = functions.transform(node, ctx) + node = return_statements.transform(node, ctx) with self.compiled(node, {}, (ops.name_scope,)) as result: first, second = result.TestClass().test_fn(constant_op.constant(1)) diff --git a/tensorflow/python/autograph/converters/return_statements.py b/tensorflow/python/autograph/converters/return_statements.py index 39bac60fb91..e4062e42db7 100644 --- a/tensorflow/python/autograph/converters/return_statements.py +++ b/tensorflow/python/autograph/converters/return_statements.py @@ -220,9 +220,9 @@ class ReturnStatementsTransformer(converter.Base): retval = val """ - def __init__(self, ctx, default_to_null_return): + def __init__(self, ctx, allow_missing_return): super(ReturnStatementsTransformer, self).__init__(ctx) - self.default_to_null_return = default_to_null_return + self.allow_missing_return = allow_missing_return def visit_Return(self, node): for block in reversed(self.state[_Block].stack): @@ -339,75 +339,68 @@ class ReturnStatementsTransformer(converter.Base): return node def visit_FunctionDef(self, node): - self.state[_Function].enter() - self.state[_Block].enter() - self.state[_Block].is_function = True + with self.state[_Function] as fn: + with self.state[_Block] as block: + block.is_function = True - scope = anno.getanno(node, NodeAnno.BODY_SCOPE) - do_return_var_name = self.ctx.namer.new_symbol( - 'do_return', scope.referenced) - retval_var_name = self.ctx.namer.new_symbol('retval_', scope.referenced) - self.state[_Function].do_return_var_name = do_return_var_name - self.state[_Function].retval_var_name = retval_var_name + scope = anno.getanno(node, NodeAnno.BODY_SCOPE) + do_return_var_name = self.ctx.namer.new_symbol('do_return', + scope.referenced) + retval_var_name = self.ctx.namer.new_symbol('retval_', scope.referenced) + fn.do_return_var_name = do_return_var_name + fn.retval_var_name = retval_var_name - converted_body = self._visit_statement_block(node, node.body) + node.body = self._visit_statement_block(node, node.body) - # Avoid placing statements before any eventual docstring. - # TODO(mdan): Should a docstring even be included in the output? 
- docstring = None - if converted_body: - if (isinstance(converted_body[0], gast.Expr) and - isinstance(converted_body[0].value, gast.Constant)): - docstring = converted_body[0] - converted_body = converted_body[1:] + if block.return_used: - if self.state[_Block].return_used: + if self.allow_missing_return: + # The function whould have a single `with` node that wraps the + # entire body. If the function had a docstring, the body has two + # nodes, with the `with` as the second node. + wrapper_node = node.body[-1] + assert isinstance(wrapper_node, gast.With), ( + 'This transformer requires the functions converter.') - if self.default_to_null_return: - # TODO(mdan): Remove the (do_return_var_name,) below. - # Currently, that line ensures the variable is both defined and alive - # throughout the function. - template = """ - do_return_var_name = False - retval_var_name = ag__.UndefinedReturnValue() - body - (do_return_var_name,) - return ag__.retval(retval_var_name) - """ - else: - template = """ - body - return retval_var_name - """ - node.body = templates.replace( - template, - body=converted_body, - do_return_var_name=do_return_var_name, - retval_var_name=retval_var_name) + template = """ + do_return_var_name = False + retval_var_name = ag__.UndefinedReturnValue() + body + return function_context.ret(retval_var_name, do_return_var_name) + """ - if docstring: - node.body.insert(0, docstring) + wrapper_node.body = templates.replace( + template, + body=wrapper_node.body, + do_return_var_name=do_return_var_name, + function_context=anno.getanno(node, 'function_context_name'), + retval_var_name=retval_var_name) + else: + template = """ + body + return retval_var_name + """ + node.body = templates.replace( + template, + body=node.body, + do_return_var_name=do_return_var_name, + retval_var_name=retval_var_name) - self.state[_Block].exit() - self.state[_Function].exit() return node def transform(node, ctx, default_to_null_return=True): - """Ensure a function has only a single return.""" - # Note: Technically, these two could be merged into a single walk, but - # keeping them separate helps with readability. - + """Ensure a function has only a single return, at the end.""" node = qual_names.resolve(node) node = activity.resolve(node, ctx, None) + # Note: Technically, these two could be merged into a single walk, but + # keeping them separate helps with readability. 
node = ConditionalReturnRewriter(ctx).visit(node) node = qual_names.resolve(node) node = activity.resolve(node, ctx, None) - transformer = ReturnStatementsTransformer( - ctx, default_to_null_return=default_to_null_return) + ctx, allow_missing_return=default_to_null_return) node = transformer.visit(node) - return node diff --git a/tensorflow/python/autograph/converters/return_statements_test.py b/tensorflow/python/autograph/converters/return_statements_test.py index df687927638..3f1e6a0bd97 100644 --- a/tensorflow/python/autograph/converters/return_statements_test.py +++ b/tensorflow/python/autograph/converters/return_statements_test.py @@ -18,6 +18,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.autograph.converters import functions from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.core import converter_testing from tensorflow.python.framework import ops @@ -28,7 +29,7 @@ class SingleReturnTest(converter_testing.TestCase): def assertTransformedEquivalent(self, test_fn, *inputs): ns = {'ops': ops} - with self.converted(test_fn, return_statements, ns) as result: + with self.converted(test_fn, (functions, return_statements), ns) as result: self.assertEqual(test_fn(*inputs), result.test_fn(*inputs)) def test_straightline(self): diff --git a/tensorflow/python/autograph/converters/variables.py b/tensorflow/python/autograph/converters/variables.py new file mode 100644 index 00000000000..3028a65a69b --- /dev/null +++ b/tensorflow/python/autograph/converters/variables.py @@ -0,0 +1,76 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Overloads all variable read operations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gast + +from tensorflow.python.autograph.core import converter +from tensorflow.python.autograph.pyct import anno +from tensorflow.python.autograph.pyct import templates + + +class VariableAccessTransformer(converter.Base): + """Rewrites basic symbol reads. + + This transformer rewrites variable reads with a "read" operator which allows + tracking activity. + + Example: + + For a basic statement: + + a = b + c + + This is translated to: + + a = ld(b) + ld(c) + + Augmented assignment operations also introduce a `ld` operator: + + a += b + + The assignment target also receives an operator to properly represent the + read: + + a = ld(a) + a += ld(b) + """ + + def visit_Name(self, node): + # Only the loads which existed in the original code are overloaded. 
+ if not anno.hasanno(node, anno.Static.ORIG_DEFINITIONS): + return node + if isinstance(node.ctx, gast.Load): + node = templates.replace_as_expression('ag__.ld(var_)', var_=node) + return node + + def visit_AugAssign(self, node): + if isinstance(node.target, gast.Name): + template = """ + var_ = ag__.ld(var_) + original + """ + node = templates.replace(template, var_=node.target, original=node) + else: + node = self.generic_visit(node) + return node + + +def transform(node, ctx): + return VariableAccessTransformer(ctx).visit(node) diff --git a/tensorflow/python/autograph/converters/variables_test.py b/tensorflow/python/autograph/converters/variables_test.py new file mode 100644 index 00000000000..556dafbaa8a --- /dev/null +++ b/tensorflow/python/autograph/converters/variables_test.py @@ -0,0 +1,116 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for variables module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib + +from tensorflow.python.autograph.converters import variables +from tensorflow.python.autograph.core import converter_testing +from tensorflow.python.platform import test + + +class VariablesTest(converter_testing.TestCase): + + @contextlib.contextmanager + def apply_add_one_conversion(self, fn): + """Generates code which adds 1 to all variable reads.""" + with self.converted(fn, variables, {}) as result: + result.ag__.__dict__['ld'] = lambda x: x + 1 + yield result + + def test_read(self): + + def test_fn(l): + return l + + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(1), 2) + + def test_aug_assign(self): + + def test_fn(l): + l *= 10 + return l + + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(1), (1 + 1) * 10 + 1) # two reads + + def test_attribute(self): + + class TestClass(object): + + def __init__(self): + self.v = 1 + + def __add__(self, other): + self.v += other + return self + + def test_fn(l): + return l.v + + tc = TestClass() + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(tc), 2) + + def test_subscript(self): + + class TestClass(object): + + def __init__(self): + self.v = 1 + + def __add__(self, other): + self.v += other + return self + + def __getitem__(self, _): + return self.v + + def test_fn(l): + return l[0] + + tc = TestClass() + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(tc), 2) + + def test_call(self): + + class TestClass(object): + + def __init__(self): + self.v = 1 + + def __add__(self, other): + self.v += other + return self + + def __call__(self): + return self.v + + def test_fn(l): + return l() + + tc = TestClass() + with self.apply_add_one_conversion(test_fn) as result: + self.assertEqual(result.test_fn(tc), 2) + + +if __name__ == 
'__main__': + test.main() diff --git a/tensorflow/python/autograph/core/BUILD b/tensorflow/python/autograph/core/BUILD index 655dc118a37..4a5c50dac55 100644 --- a/tensorflow/python/autograph/core/BUILD +++ b/tensorflow/python/autograph/core/BUILD @@ -30,6 +30,7 @@ py_library( visibility = ["//tensorflow:__subpackages__"], deps = [ "//tensorflow/python:framework_ops", + "//tensorflow/python/autograph/operators", "//tensorflow/python/autograph/pyct", "//tensorflow/python/autograph/pyct/static_analysis", "//tensorflow/python/autograph/utils", diff --git a/tensorflow/python/autograph/core/function_wrappers.py b/tensorflow/python/autograph/core/function_wrappers.py index cc0e7b98de5..d425f8b679d 100644 --- a/tensorflow/python/autograph/core/function_wrappers.py +++ b/tensorflow/python/autograph/core/function_wrappers.py @@ -20,12 +20,16 @@ from __future__ import print_function from tensorflow.python.autograph.core import ag_ctx from tensorflow.python.autograph.core import converter +from tensorflow.python.autograph.operators import variables from tensorflow.python.framework import auto_control_deps from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.util import nest +# TODO(mdan): Move this into operators - it represents a function definition. + + class FunctionScope(object): """Context manager that wraps the body of a converted function. @@ -84,8 +88,13 @@ class FunctionScope(object): if self.use_auto_deps: self.autodeps_scope.__exit__(exc_type, exc_val, exc_tb) - def mark_return_value(self, value): + def ret(self, value, did_return): """Marks a value as returned from the function guarded by the scope.""" + del did_return + + if isinstance(value, variables.UndefinedReturnValue): + return None + if self.use_auto_deps: self._return_value_marked = True if value is None: diff --git a/tensorflow/python/autograph/core/function_wrappers_test.py b/tensorflow/python/autograph/core/function_wrappers_test.py index 917a5358633..344ba495570 100644 --- a/tensorflow/python/autograph/core/function_wrappers_test.py +++ b/tensorflow/python/autograph/core/function_wrappers_test.py @@ -46,7 +46,7 @@ class FunctionWrappersTest(test.TestCase): converter.ConversionOptions( optional_features=converter.Feature.AUTO_CONTROL_DEPS)) as scope: v.assign(2) - op = scope.mark_return_value(constant_op.constant(1)) + op = scope.ret(constant_op.constant(1), True) self.evaluate(op) self.assertEqual(self.evaluate(v.read_value()), 2) diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py index 7a7efe3d43a..eeea0aef896 100644 --- a/tensorflow/python/autograph/impl/conversion.py +++ b/tensorflow/python/autograph/impl/conversion.py @@ -36,6 +36,7 @@ from tensorflow.python.autograph.converters import lists from tensorflow.python.autograph.converters import logical_expressions from tensorflow.python.autograph.converters import return_statements from tensorflow.python.autograph.converters import slices +from tensorflow.python.autograph.converters import variables from tensorflow.python.autograph.core import config from tensorflow.python.autograph.core import converter from tensorflow.python.autograph.core import function_wrappers @@ -92,6 +93,7 @@ class AutoGraphTranspiler(transpiler.FunctionTranspiler): node = control_flow.transform(node, ctx) node = conditional_expressions.transform(node, ctx) node = logical_expressions.transform(node, ctx) + node = variables.transform(node, ctx) return node diff --git 
a/tensorflow/python/autograph/operators/BUILD b/tensorflow/python/autograph/operators/BUILD index 6db9e4f8e3b..3851c7b44ba 100644 --- a/tensorflow/python/autograph/operators/BUILD +++ b/tensorflow/python/autograph/operators/BUILD @@ -29,8 +29,7 @@ py_library( "logical.py", "py_builtins.py", "slices.py", - "special_values.py", - "symbols.py", + "variables.py", ], srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], @@ -148,19 +147,8 @@ py_test( ) py_test( - name = "special_values_test", - srcs = ["special_values_test.py"], - python_version = "PY3", - srcs_version = "PY2AND3", - deps = [ - ":operators", - "//tensorflow/python:client_testlib", - ], -) - -py_test( - name = "symbols_test", - srcs = ["symbols_test.py"], + name = "variables_test", + srcs = ["variables_test.py"], python_version = "PY3", srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow/python/autograph/operators/__init__.py b/tensorflow/python/autograph/operators/__init__.py index 495b6070aae..f7f9078107c 100644 --- a/tensorflow/python/autograph/operators/__init__.py +++ b/tensorflow/python/autograph/operators/__init__.py @@ -60,8 +60,6 @@ from tensorflow.python.autograph.operators.py_builtins import range_ from tensorflow.python.autograph.operators.slices import get_item from tensorflow.python.autograph.operators.slices import GetItemOpts from tensorflow.python.autograph.operators.slices import set_item -from tensorflow.python.autograph.operators.special_values import is_undefined -from tensorflow.python.autograph.operators.special_values import is_undefined_return -from tensorflow.python.autograph.operators.special_values import retval -from tensorflow.python.autograph.operators.special_values import Undefined -from tensorflow.python.autograph.operators.special_values import UndefinedReturnValue +from tensorflow.python.autograph.operators.variables import ld +from tensorflow.python.autograph.operators.variables import Undefined +from tensorflow.python.autograph.operators.variables import UndefinedReturnValue diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py index 48b7971ec16..592281b0ce2 100644 --- a/tensorflow/python/autograph/operators/control_flow.py +++ b/tensorflow/python/autograph/operators/control_flow.py @@ -65,7 +65,7 @@ import traceback import numpy as np from tensorflow.python.autograph.operators import py_builtins -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables from tensorflow.python.autograph.utils import ag_logging from tensorflow.python.autograph.utils import compat_util from tensorflow.python.autograph.utils import misc @@ -103,13 +103,13 @@ def _verify_loop_init_vars(values, symbol_names): for name, value in zip(symbol_names, values): if value is None: raise ValueError('"{}" may not be None before the loop.'.format(name)) - if special_values.is_undefined_return(value): + if isinstance(value, variables.UndefinedReturnValue): # Assumption: the loop will only capture the variable which tracks the # return value if the loop contained a return statement. # TODO(mdan): This should be checked at the place where return occurs. 
raise ValueError( 'return statements are not supported within a TensorFlow loop.') - if special_values.is_undefined(value): + if isinstance(value, variables.Undefined): raise ValueError('"{}" must be defined before the loop.'.format(name)) @@ -495,8 +495,7 @@ def _tf_range_for_stmt( iterate = compat_util.BasicRef(start) def _value_or(name, var, default): - if (name == opts['iterate_names'] - and isinstance(var, special_values.Undefined)): + if (name == opts['iterate_names'] and isinstance(var, variables.Undefined)): return default return var @@ -1019,7 +1018,15 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): results_tuple = results else: results_tuple = results, - undefined = tuple(filter(special_values.is_undefined, results_tuple)) + + for result in results_tuple: + if isinstance(result, variables.UndefinedReturnValue): + raise ValueError( + 'A value must also be returned from the {} branch. If a value is ' + 'returned from one branch of a conditional a value must be ' + 'returned from all branches.'.format(branch_name)) + + undefined = [v for v in results_tuple if isinstance(v, variables.Undefined)] if undefined: raise ValueError( 'The following symbols must also be initialized in the {} branch: {}.' @@ -1027,13 +1034,6 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): ' statement.'.format(branch_name, tuple(s.symbol_name for s in undefined))) - for result in results_tuple: - if special_values.is_undefined_return(result): - raise ValueError( - 'A value must also be returned from the {} branch. If a value is ' - 'returned from one branch of a conditional a value must be ' - 'returned from all branches.'.format(branch_name)) - return results return wrapper diff --git a/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py b/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py index e01a2f206c8..5a900fb19ed 100644 --- a/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py +++ b/tensorflow/python/autograph/operators/control_flow_deprecated_py2.py @@ -66,7 +66,7 @@ import functools import numpy as np from tensorflow.python.autograph.operators import py_builtins -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables from tensorflow.python.autograph.utils import ag_logging from tensorflow.python.autograph.utils import misc from tensorflow.python.autograph.utils import tensors @@ -103,13 +103,13 @@ INEFFICIENT_UNROLL_MIN_OPS = 1 def _disallow_undefs_into_loop(*values): """Ensures that all values in the state are defined when entering a loop.""" - undefined = tuple(filter(special_values.is_undefined, values)) + undefined = [v for v in values if isinstance(v, variables.Undefined)] if undefined: raise ValueError( '{} must be defined before the loop.'.format( ','.join(s.symbol_name for s in undefined))) for value in values: - if special_values.is_undefined_return(value): + if isinstance(value, variables.UndefinedReturnValue): # Assumption: the loop will only capture the variable which tracks the # return value if the loop contained a return statement. # TODO(mdan): This should be checked at the place where return occurs. 
@@ -1129,7 +1129,7 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): results_tuple = results else: results_tuple = results, - undefined = tuple(filter(special_values.is_undefined, results_tuple)) + undefined = [v for v in results_tuple if isinstance(v, variables.Undefined)] if undefined: raise ValueError( 'The following symbols must also be initialized in the {} branch: {}.' @@ -1138,7 +1138,7 @@ def _wrap_disallow_undefs_from_cond(func, branch_name): tuple(s.symbol_name for s in undefined))) for result in results_tuple: - if special_values.is_undefined_return(result): + if isinstance(result, variables.UndefinedReturnValue): raise ValueError( 'A value must also be returned from the {} branch. If a value is ' 'returned from one branch of a conditional a value must be ' diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py index 5f0a9d09bf3..1c4407904b2 100644 --- a/tensorflow/python/autograph/operators/control_flow_test.py +++ b/tensorflow/python/autograph/operators/control_flow_test.py @@ -29,7 +29,7 @@ import numpy as np import six from tensorflow.python.autograph.operators import control_flow -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables as variable_operators from tensorflow.python.autograph.utils import ag_logging from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import def_function @@ -546,7 +546,7 @@ class ForLoopTest(test.TestCase): with self.assertRaisesRegex(ValueError, '"s" may not be None'): self._basic_loop(None, lambda i, s: s) with self.assertRaisesRegex(ValueError, '"s" must be defined'): - self._basic_loop(special_values.Undefined(''), lambda i, s: s) + self._basic_loop(variable_operators.Undefined(''), lambda i, s: s) def test_tensor_none_output(self): with self.assertRaisesRegex(ValueError, '"s" is None at the end'): @@ -785,7 +785,7 @@ class WhileLoopTest(test.TestCase): with self.assertRaisesRegex(ValueError, '"s" may not be None'): self._basic_loop(None, lambda i, s: s) with self.assertRaisesRegex(ValueError, '"s" must be defined'): - self._basic_loop(special_values.Undefined(''), lambda i, s: s) + self._basic_loop(variable_operators.Undefined(''), lambda i, s: s) def test_tensor_none_output(self): with self.assertRaisesRegex(ValueError, '"s" is None at the end'): @@ -887,10 +887,10 @@ class IfStmtTest(test.TestCase): def test_tensor_undefined_output(self): with self.assertRaisesRegex( ValueError, "must also be initialized in the if.*'s'"): - self._basic_cond(lambda: special_values.Undefined('s'), lambda: 1) + self._basic_cond(lambda: variable_operators.Undefined('s'), lambda: 1) with self.assertRaisesRegex( ValueError, "must also be initialized in the else.*'s'"): - self._basic_cond(lambda: 1, lambda: special_values.Undefined('s')) + self._basic_cond(lambda: 1, lambda: variable_operators.Undefined('s')) def test_tensor_dtype_change(self): with self.assertRaisesRegex(TypeError, '"s" has dtype int32.*but.*float32'): diff --git a/tensorflow/python/autograph/operators/symbols.py b/tensorflow/python/autograph/operators/symbols.py deleted file mode 100644 index 0dd7e0a5956..00000000000 --- a/tensorflow/python/autograph/operators/symbols.py +++ /dev/null @@ -1,115 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Abstract representation of composite symbols that can be used in staging code. - -This provides a way to checkpoint the values of symbols that may be undefined -entering staged control flow. This checkpointing is necessary to prevent some -unintended side-effects. For example checkpointing prevents side-effects in one -branch of a conditional from leaking into another. -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.autograph.operators import special_values - - -is_undefined = special_values.is_undefined -Undefined = special_values.Undefined - - -class Symbol(object): - """Representation of a simple or composite Python symbol. - - Subclasses should implement `maybe_compute_value(self)` that returns the value - corresponding to the symbol or Undefined if no such value exists. - """ - - def __init__(self, name): - self.name = name - - -class ValueSymbol(Symbol): - """Representation of a simple Python symbol with a concrete value. - - This includes variables and literals. Since we are reifying undefined symbols - `Undefined` is also a valid value. - """ - - def __init__(self, name, value): - super(ValueSymbol, self).__init__(name) - self.value = value - - def maybe_compute_value(self): - return self.value - - -class AttributeAccessSymbol(Symbol): - """Representation of Python attribute access e.g. `a.b`.""" - - def __init__(self, parent_symbol, attr_name): - super(AttributeAccessSymbol, self).__init__( - parent_symbol.name + '.' + attr_name) - self.attr_name = attr_name - self.parent_symbol = parent_symbol - - def maybe_compute_value(self): - """Compute the value corresponding to the attribute access or `Undefined`. - - This will be `Undefined` if no such value exists either because there is no - such attribute or if the base is itself undefined. - - Returns: - value corresponding to the attribute access or `Undefined` - """ - parent_value = self.parent_symbol.maybe_compute_value() - if (is_undefined(parent_value) or - getattr(parent_value, self.attr_name, None) is None): - return Undefined(self.name) - - return parent_value.__getattribute__(self.attr_name) - - -class SubscriptSymbol(Symbol): - """Representation of Python subscript access e.g. `a[b]`.""" - - def __init__(self, parent_symbol, index_symbol): - super(SubscriptSymbol, self).__init__( - parent_symbol.name + '[' + index_symbol.name + ']') - self.index_symbol = index_symbol - self.parent_symbol = parent_symbol - - def maybe_compute_value(self): - """Compute the value corresponding to the subscript access or `Undefined`. - - This will be `Undefined` if no such value exists either because there is no - element corresponding to the given subscript or if the base itself is - not defined. 
- - Returns: - value corresponding to the subscript access or `Undefined` - """ - parent_value = self.parent_symbol.maybe_compute_value() - index_value = self.index_symbol.maybe_compute_value() - if is_undefined(parent_value) or is_undefined(index_value): - return Undefined(self.name) - - try: - return parent_value[index_value] - except (IndexError, KeyError, TypeError): - # Reify the lack of an object for the given index/key - # This allows us to define them later without regret - return Undefined(self.name) diff --git a/tensorflow/python/autograph/operators/symbols_test.py b/tensorflow/python/autograph/operators/symbols_test.py deleted file mode 100644 index 3acb16273bd..00000000000 --- a/tensorflow/python/autograph/operators/symbols_test.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for special symbol handling.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tensorflow.python.autograph.operators import special_values -from tensorflow.python.autograph.operators import symbols -from tensorflow.python.platform import test - -Undefined = special_values.Undefined -AttributeAccessSymbol = symbols.AttributeAccessSymbol -SubscriptSymbol = symbols.SubscriptSymbol -ValueSymbol = symbols.ValueSymbol - - -class SymbolsTest(test.TestCase): - - def test_value_symbol_returns_value(self): - a = 42 - a_symbol = ValueSymbol('a', a) - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(a_symbol.name, 'a') - - def test_attribute_access_missing_attribute(self): - class Foo(object): - pass - a = Foo() - - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a.b') - - def test_attribute_access_undefined_target(self): - a = Undefined('a') - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a.b') - - def test_attribute_access_basic(self): - class Foo(object): - - def __init__(self): - self.b = 'this is an attribute' - - a = Foo() - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(a_b_symbol.maybe_compute_value(), a.b) - - def test_item_access_undefined_index(self): - class Foo(object): - - def __getitem__(self, key): - return 'this is an item' - - a = Foo() - b = Undefined('b') - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, 
b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a[b]') - - def test_item_access_no_getitem(self): - class Foo(object): - pass - - a = Foo() - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a[b]') - - def test_item_access_undefined_root(self): - a = Undefined('a') - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertIsInstance(a_b_symbol.maybe_compute_value(), Undefined) - self.assertEqual(a_b_symbol.maybe_compute_value().symbol_name, 'a[b]') - - def test_item_access_basic(self): - class Foo(object): - - def __getitem__(self, key): - return 'this is an item' - - a = Foo() - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertEqual(a_b_symbol.maybe_compute_value(), a[b]) - - def test_item_access_after_attribute_access(self): - class Foo(object): - - def __getitem__(self, key): - return 'this is an item' - - class Bar(object): - - def __init__(self): - self.b = Foo() - - a = Bar() - c = 42 - a_symbol = ValueSymbol('a', a) - c_symbol = ValueSymbol('c', c) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - a_b_c_symbol = SubscriptSymbol(a_b_symbol, c_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(c_symbol.maybe_compute_value(), c) - self.assertEqual(a_b_symbol.maybe_compute_value(), a.b) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a.b[c]) - - def test_attribute_access_after_item_access(self): - class Bar(object): - - def __init__(self): - self.c = object() - - item = Bar() - - class Foo(object): - - def __getitem__(self, key): - return item - - a = Foo() - b = 42 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - a_b_c_symbol = AttributeAccessSymbol(a_b_symbol, 'c') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertEqual(a_b_symbol.maybe_compute_value(), a[b]) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a[b].c) - - def test_item_access_after_item_access(self): - class Bar(object): - - def __getitem__(self, key): - return 'this is an item' - - item = Bar() - - class Foo(object): - - def __getitem__(self, key): - return item - - a = Foo() - b = 42 - c = 43 - a_symbol = ValueSymbol('a', a) - b_symbol = ValueSymbol('b', b) - c_symbol = ValueSymbol('b', c) - a_b_symbol = SubscriptSymbol(a_symbol, b_symbol) - a_b_c_symbol = SubscriptSymbol(a_b_symbol, c_symbol) - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(b_symbol.maybe_compute_value(), b) - self.assertEqual(a_b_symbol.maybe_compute_value(), a[b]) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a[b][c]) - - def 
test_attribute_access_after_attribute_access(self): - class Bar(object): - - def __init__(self): - self.c = object() - - class Foo(object): - - def __init__(self): - self.b = Bar() - - a = Foo() - a_symbol = ValueSymbol('a', a) - a_b_symbol = AttributeAccessSymbol(a_symbol, 'b') - a_b_c_symbol = AttributeAccessSymbol(a_b_symbol, 'c') - - self.assertEqual(a_symbol.maybe_compute_value(), a) - self.assertEqual(a_b_symbol.maybe_compute_value(), a.b) - self.assertEqual(a_b_c_symbol.maybe_compute_value(), a.b.c) - - -if __name__ == '__main__': - test.main() diff --git a/tensorflow/python/autograph/operators/special_values.py b/tensorflow/python/autograph/operators/variables.py similarity index 72% rename from tensorflow/python/autograph/operators/special_values.py rename to tensorflow/python/autograph/operators/variables.py index c172cce23f1..150f64e1758 100644 --- a/tensorflow/python/autograph/operators/special_values.py +++ b/tensorflow/python/autograph/operators/variables.py @@ -19,6 +19,13 @@ from __future__ import division from __future__ import print_function +def ld(v): + """Load variable operator.""" + if isinstance(v, Undefined): + return v.read() + return v + + class Undefined(object): """Represents an undefined symbol in Python. @@ -51,6 +58,10 @@ class Undefined(object): def __init__(self, symbol_name): self.symbol_name = symbol_name + def read(self): + raise UnboundLocalError("'{}' is used before assignment".format( + self.symbol_name)) + def __repr__(self): return self.symbol_name @@ -66,34 +77,7 @@ class Undefined(object): return self -def is_undefined(value): - """Checks whether Autograph has determined that a given value is undefined. - - This only works in places where Autograph reifies undefined symbols. Note that - if this function is passed a truly undefined symbol the call-site will raise - NameError. - - Args: - value: value to test for undefinedness - Returns: - Boolean, whether the input value is undefined. - """ - return isinstance(value, Undefined) - - # TODO(mdan): Refactor as a RetVal object, aggregating the value and do_return. 
class UndefinedReturnValue(object): - """Represents a default return value from a function (None in Python).""" + """Represents a return value that is undefined.""" pass - - -def retval(value): - """Returns the actual value that a return statement should produce.""" - if isinstance(value, UndefinedReturnValue): - return None - return value - - -def is_undefined_return(value): - """Checks whether `value` is the default return value.""" - return isinstance(value, UndefinedReturnValue) diff --git a/tensorflow/python/autograph/operators/special_values_test.py b/tensorflow/python/autograph/operators/variables_test.py similarity index 58% rename from tensorflow/python/autograph/operators/special_values_test.py rename to tensorflow/python/autograph/operators/variables_test.py index 1742cc4277d..168e6172232 100644 --- a/tensorflow/python/autograph/operators/special_values_test.py +++ b/tensorflow/python/autograph/operators/variables_test.py @@ -18,28 +18,38 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.autograph.operators import special_values +from tensorflow.python.autograph.operators import variables from tensorflow.python.platform import test class SpecialValuesTest(test.TestCase): def test_undefined(self): - undefined_symbol = special_values.Undefined('name') - self.assertEqual(undefined_symbol.symbol_name, 'name') + undefined_symbol = variables.Undefined('name') + undefined_symbol2 = variables.Undefined('name') - undefined_symbol2 = special_values.Undefined('name') + self.assertEqual(undefined_symbol.symbol_name, 'name') + self.assertEqual(undefined_symbol2.symbol_name, 'name') self.assertNotEqual(undefined_symbol, undefined_symbol2) - self.assertTrue(special_values.is_undefined(undefined_symbol)) - self.assertTrue(special_values.is_undefined(undefined_symbol2)) - def test_undefined_operations(self): - undefined_symbol = special_values.Undefined('name') + undefined_symbol = variables.Undefined('name') + + self.assertIsInstance(undefined_symbol.foo, variables.Undefined) + self.assertIsInstance(undefined_symbol[0], variables.Undefined) + self.assertNotIsInstance(undefined_symbol.__class__, variables.Undefined) + + def test_read(self): + self.assertEqual(variables.ld(1), 1) + o = object() + self.assertEqual(variables.ld(o), o) + + self.assertIsNone(variables.ld(None)) + + def test_read_undefined(self): + with self.assertRaisesRegex(UnboundLocalError, 'used before assignment'): + variables.ld(variables.Undefined('a')) - self.assertTrue(special_values.is_undefined(undefined_symbol.foo)) - self.assertTrue(special_values.is_undefined(undefined_symbol[0])) - self.assertFalse(special_values.is_undefined(undefined_symbol.__class__)) if __name__ == '__main__': test.main() diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py index 7333ec0c872..ba27280f729 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py +++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_py3_test.py @@ -78,6 +78,18 @@ class ReachingDefinitionsAnalyzerTest( self.assertSameDef(local_body[1].test, local_body[2].value.elts[0]) + # Note: the function name is is visible inside the function body. But it's + # a closure variable, not a local. + # + # Example: + # + # >>> def f(): + # ... 
print(f) + # >>> g = f + # >>> f = 'something else' + # >>> g() + # something else + # self.assertHasDefinedIn(local_body[1], ('a', 'b')) diff --git a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py index c4e7cbd4d17..64b00fcbeba 100644 --- a/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py +++ b/tensorflow/python/autograph/pyct/static_analysis/reaching_definitions_test.py @@ -255,6 +255,9 @@ class ReachingDefinitionsAnalyzerTest(ReachingDefinitionsAnalyzerTestBase): inner_fn_body = fn_body[1].body[1].body def_of_a_in_foo = inner_fn_body[0].value + # Even though `a` is visible in the inner functio above, the late binding + # makes it impossible to assume that the same value will be visible at + # call time. self.assertHasDefs(def_of_a_in_foo, 0) def test_nested_functions_isolation(self): From 7e39134874fb8315ea941c661f32394eaf667c3b Mon Sep 17 00:00:00 2001 From: Eugene Zhulenev Date: Wed, 13 May 2020 15:05:25 -0700 Subject: [PATCH 140/412] [Grappler] Do not add control edges from placeholder inputs in function inlining PiperOrigin-RevId: 311412339 Change-Id: Ie40c0c44f1d6b42b53c259f8ad92d171577cd9c7 --- tensorflow/core/grappler/optimizers/function_optimizer.cc | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc index eaccff3b127..ed3af955c13 100644 --- a/tensorflow/core/grappler/optimizers/function_optimizer.cc +++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc @@ -1122,7 +1122,15 @@ void AddStrictInputSemantics(Node* caller, Graph* g) { VLOG(3) << "Add control edges from all data inputs to enforce strict " "semantics with regard to function inputs"; + + // Do not add control edges from placeholders, because it will prevent + // pruning, and they can't produce any side effects anyway. + const auto is_placeholder = [](const Node* node) -> bool { + return node->type_string() == "Placeholder"; + }; + for (const Node* node : data_inputs) { + if (is_placeholder(node)) continue; g->AddControlEdge(g->FindNodeId(node->id()), caller, /*allow_duplicates=*/true); } From e1b0e64119a082bda7ac0125c59b970d7eac54f1 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 15:33:52 -0700 Subject: [PATCH 141/412] Export RandomZoom after its odd behavior was fixed. 
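
A short usage sketch of the newly exported symbol (this mirrors the docstring
example added in the diff below; the input shape is the one used there and is
only illustrative):

    import numpy as np
    import tensorflow as tf

    input_img = np.random.random((32, 224, 224, 3))
    layer = tf.keras.layers.experimental.preprocessing.RandomZoom(.5, .2)
    out_img = layer(input_img)
    print(out_img.shape)  # (32, 224, 224, 3)
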
PiperOrigin-RevId: 311417546 Change-Id: Idb5bcff8b97a1bba1ab054a19ad0a701cf04cc00 --- .../preprocessing/image_preprocessing.py | 45 ++-- .../preprocessing/image_preprocessing_test.py | 22 +- ...erimental.preprocessing.-random-zoom.pbtxt | 218 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + ...erimental.preprocessing.-random-zoom.pbtxt | 218 ++++++++++++++++++ ...as.layers.experimental.preprocessing.pbtxt | 4 + 6 files changed, 495 insertions(+), 16 deletions(-) create mode 100644 tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt create mode 100644 tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py index 05a6e84e6cc..832915dac68 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py @@ -827,6 +827,7 @@ class RandomRotation(Layer): return dict(list(base_config.items()) + list(config.items())) +@keras_export('keras.layers.experimental.preprocessing.RandomZoom') class RandomZoom(Layer): """Randomly zoom each image during training. @@ -847,7 +848,8 @@ class RandomZoom(Layer): For instance, `width_factor=(0.2, 0.3)` result in an output zooming out between 20% to 30%. `width_factor=(-0.3, -0.2)` result in an output zooming in between 20% - to 30%. + to 30%. Defaults to `None`, i.e., zooming vertical and horizontal + directions by preserving the aspect ratio. fill_mode: Points outside the boundaries of the input are filled according to the given mode (one of `{'constant', 'reflect', 'wrap'}`). - *reflect*: `(d c b a | a b c d | d c b a)` @@ -860,6 +862,14 @@ class RandomZoom(Layer): seed: Integer. Used to create a random seed. name: A string, the name of the layer. + Example: + + >>> input_img = np.random.random((32, 224, 224, 3)) + >>> layer = tf.keras.layers.experimental.preprocessing.RandomZoom(.5, .2) + >>> out_img = layer(input_img) + >>> out_img.shape + TensorShape([32, 224, 224, 3]) + Input shape: 4D tensor with shape: `(samples, height, width, channels)`, data_format='channels_last'. @@ -873,9 +883,10 @@ class RandomZoom(Layer): negative. """ + # TODO(b/156526279): Add `fill_value` argument. def __init__(self, height_factor, - width_factor, + width_factor=None, fill_mode='reflect', interpolation='bilinear', seed=None, @@ -894,16 +905,17 @@ class RandomZoom(Layer): 'got {}'.format(height_factor)) self.width_factor = width_factor - if isinstance(width_factor, (tuple, list)): - self.width_lower = width_factor[0] - self.width_upper = width_factor[1] - else: - self.width_lower = -width_factor - self.width_upper = width_factor + if width_factor is not None: + if isinstance(width_factor, (tuple, list)): + self.width_lower = width_factor[0] + self.width_upper = width_factor[1] + else: + self.width_lower = -width_factor # pylint: disable=invalid-unary-operand-type + self.width_upper = width_factor - if self.width_lower < -1. or self.width_upper < -1.: - raise ValueError('`width_factor` must have values larger than -1, ' - 'got {}'.format(width_factor)) + if self.width_lower < -1. 
or self.width_upper < -1.: + raise ValueError('`width_factor` must have values larger than -1, ' + 'got {}'.format(width_factor)) check_fill_mode_and_interpolation(fill_mode, interpolation) @@ -928,10 +940,13 @@ class RandomZoom(Layer): shape=[batch_size, 1], minval=1. + self.height_lower, maxval=1. + self.height_upper) - width_zoom = self._rng.uniform( - shape=[batch_size, 1], - minval=1. + self.width_lower, - maxval=1. + self.width_upper) + if self.width_factor is not None: + width_zoom = self._rng.uniform( + shape=[batch_size, 1], + minval=1. + self.width_lower, + maxval=1. + self.width_upper) + else: + width_zoom = height_zoom zooms = math_ops.cast( array_ops.concat([width_zoom, height_zoom], axis=1), dtype=dtypes.float32) diff --git a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 28c9955c9dd..38d2d25916a 100644 --- a/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -1021,7 +1021,27 @@ class RandomZoomTest(keras_parameterized.TestCase): for dtype in (np.int64, np.float32): with tf_test_util.use_gpu(): input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) - layer = image_preprocessing.RandomZoom((.5, .5), (.5, .5), + layer = image_preprocessing.RandomZoom((.5, .5), (.8, .8), + fill_mode='constant', + interpolation='nearest') + output_image = layer(np.expand_dims(input_image, axis=0)) + # pyformat: disable + expected_output = np.asarray([ + [0, 0, 0, 0, 0], + [0, 5, 7, 9, 0], + [0, 10, 12, 14, 0], + [0, 20, 22, 24, 0], + [0, 0, 0, 0, 0] + ]).astype(dtype) + # pyformat: enable + expected_output = np.reshape(expected_output, (1, 5, 5, 1)) + self.assertAllEqual(expected_output, output_image) + + def test_random_zoom_out_numeric_preserve_aspect_ratio(self): + for dtype in (np.int64, np.float32): + with tf_test_util.use_gpu(): + input_image = np.reshape(np.arange(0, 25), (5, 5, 1)).astype(dtype) + layer = image_preprocessing.RandomZoom((.5, .5), fill_mode='constant', interpolation='nearest') output_image = layer(np.expand_dims(input_image, axis=0)) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt new file mode 100644 index 00000000000..85850223bcb --- /dev/null +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt @@ -0,0 +1,218 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.RandomZoom" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: 
"" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'height_factor\', \'width_factor\', \'fill_mode\', \'interpolation\', \'seed\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'reflect\', \'bilinear\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + 
name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt index 20e5ca1af9c..0964922ea26 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -44,6 +44,10 @@ tf_module { name: "RandomWidth" mtype: "" } + member { + name: "RandomZoom" + mtype: "" + } member { name: "Rescaling" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt new file mode 100644 index 00000000000..85850223bcb --- /dev/null +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-random-zoom.pbtxt @@ -0,0 +1,218 @@ +path: "tensorflow.keras.layers.experimental.preprocessing.RandomZoom" +tf_class { + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + is_instance: "" + member { + name: "activity_regularizer" + mtype: "" + } + member { + name: "dtype" + mtype: "" + } + member { + name: "dynamic" + mtype: "" + } + member { + name: "inbound_nodes" + mtype: "" + } + member { + name: "input" + mtype: "" + } + member { + name: "input_mask" + mtype: "" + } + member { + name: "input_shape" + mtype: "" + } + member { + name: "input_spec" + mtype: "" + } + member { + name: "losses" + mtype: "" + } + member { + name: "metrics" + mtype: "" + } + member { + name: "name" + mtype: "" + } + member { + name: "name_scope" + mtype: "" + } + member { + name: "non_trainable_variables" + mtype: "" + } + member { + name: "non_trainable_weights" + mtype: "" + } + member { + name: "outbound_nodes" + mtype: "" + } + member { + name: "output" + mtype: "" + } + member { + name: "output_mask" + mtype: "" + } + member { + name: "output_shape" + mtype: "" + } + member { + name: "stateful" + mtype: "" + } + member { + name: "submodules" + mtype: "" + } + member { + name: "trainable" + mtype: "" + } + member { + name: "trainable_variables" + mtype: "" + } + member { + name: "trainable_weights" + mtype: "" + } + member { + name: "updates" + mtype: "" + } + member { + name: "variables" + mtype: "" + } + member { + name: "weights" + mtype: "" + } + member_method { + name: "__init__" + argspec: "args=[\'self\', \'height_factor\', \'width_factor\', \'fill_mode\', \'interpolation\', \'seed\', \'name\'], varargs=None, keywords=kwargs, 
defaults=[\'None\', \'reflect\', \'bilinear\', \'None\', \'None\'], " + } + member_method { + name: "add_loss" + argspec: "args=[\'self\', \'losses\'], varargs=None, keywords=kwargs, defaults=None" + } + member_method { + name: "add_metric" + argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'None\'], " + } + member_method { + name: "add_update" + argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "add_variable" + argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "add_weight" + argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], " + } + member_method { + name: "apply" + argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None" + } + member_method { + name: "build" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "call" + argspec: "args=[\'self\', \'inputs\', \'training\'], varargs=None, keywords=None, defaults=[\'True\'], " + } + member_method { + name: "compute_mask" + argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], " + } + member_method { + name: "compute_output_shape" + argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "compute_output_signature" + argspec: "args=[\'self\', \'input_signature\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "count_params" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "from_config" + argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_config" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_input_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_losses_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_mask_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_output_shape_at" + argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_updates_for" + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "get_weights" + argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None" + } + member_method { + name: "set_weights" + argspec: "args=[\'self\', \'weights\'], 
varargs=None, keywords=None, defaults=None" + } + member_method { + name: "with_name_scope" + argspec: "args=[\'cls\', \'method\'], varargs=None, keywords=None, defaults=None" + } +} diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt index 20e5ca1af9c..0964922ea26 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.pbtxt @@ -44,6 +44,10 @@ tf_module { name: "RandomWidth" mtype: "" } + member { + name: "RandomZoom" + mtype: "" + } member { name: "Rescaling" mtype: "" From 0ac3572e8de360a0f91a186228fe9de16c92a8cf Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Wed, 13 May 2020 16:07:02 -0700 Subject: [PATCH 142/412] Make SerializeRemoteTensorHandle block only when the remote op is a function, in order to still benefit from async execution. PiperOrigin-RevId: 311423473 Change-Id: I87a3973ddf1954facb69c14499ce2fa07a9d6e99 --- tensorflow/c/eager/c_api_remote_test.cc | 22 +++++++++++++++++++ .../core/common_runtime/eager/execute.cc | 10 +++++++-- .../core/common_runtime/eager/execute_node.cc | 16 ++++++++++---- .../common_runtime/eager/tensor_handle.cc | 9 ++++---- .../core/common_runtime/eager/tensor_handle.h | 9 ++++---- .../eager/eager_service_impl_test.cc | 5 +++-- .../eager/remote_copy_node.cc | 6 +++-- .../distributed_runtime/eager/remote_mgr.cc | 15 ++++++++----- .../distributed_runtime/eager/remote_mgr.h | 9 +++++--- .../eager/remote_mgr_test.cc | 6 +++-- .../eager/remote_tensor_handle_data.cc | 9 +++++--- .../eager/remote_tensor_handle_data.h | 7 +++--- 12 files changed, 88 insertions(+), 35 deletions(-) diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 9dc18c7a6f1..544dffb664c 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -434,6 +434,22 @@ string AddVariablesFunction() { return def.SerializeAsString(); } +void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { + TF_Status* status = TF_NewStatus(); + TFE_Op* op = TFE_NewOp(ctx, "VarIsInitializedOp", status); + EXPECT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(op, var_handle, status); + TFE_TensorHandle* is_initialized[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(op, &is_initialized[0], &num_retvals, status); + CHECK_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(is_initialized[0], status); + bool initialized = false; + memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); + EXPECT_EQ(initialized, true); + delete status; +} + void TestFunctionWithPackedInput(const bool remote) { tensorflow::ServerDef server_def = GetServerDef(3); @@ -474,6 +490,12 @@ void TestFunctionWithPackedInput(const bool remote) { TFE_TensorHandle* h1 = TestVariable(ctx, 2.0, task1_name); TFE_TensorHandle* h2 = TestVariable(ctx, 3.0, task2_name); + // Add a sync point in order to make sure that variables have been initialized + // before the function execution starts. + // TODO(b/155789951): Remove once b/155789951 is fixed. + VarIsInitialized(ctx, h1); + VarIsInitialized(ctx, h2); + // Pack 3 variable handles into one TFE_TensorHandle. 
int num_replicas = 3; std::vector handles = {h0, h1, h2}; diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index f6b4370bbdc..f23b0fa7877 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -782,9 +782,15 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, } } auto* input_handle = remote_op->add_op_inputs()->mutable_remote_handle(); + // For a multi-device function, a remote RunComponentFunction request is + // not sent through StreamingEnqueueAsync. It could arrive at a remote + // worker before a remote execution request which produces an input of the + // component function. So we wait until the remote input is ready before + // serializing it. + const bool wait_until_ready = op->is_function(); TF_RETURN_IF_ERROR(ctx.RemoteMgr()->SerializeRemoteTensorHandle( - input, input_handle, input_device, *input_device_name, - serialize_resource_dtype_and_shape)); + input, wait_until_ready, input_handle, input_device, + *input_device_name, serialize_resource_dtype_and_shape)); if (!input_handle->resource_dtypes_and_shapes().empty()) { TF_RETURN_IF_ERROR( input->AddResourceShapeMirror(op_device, input_handle->op_id(), diff --git a/tensorflow/core/common_runtime/eager/execute_node.cc b/tensorflow/core/common_runtime/eager/execute_node.cc index 3197d3e0ac7..27503cfd99d 100644 --- a/tensorflow/core/common_runtime/eager/execute_node.cc +++ b/tensorflow/core/common_runtime/eager/execute_node.cc @@ -97,9 +97,11 @@ Status ExecuteNodeArgs::Init( #if !defined(IS_MOBILE_PLATFORM) if (has_remote_inputs_) { + const bool is_function = kernel->IsFunction(); serialize_remote_handle_ = - [ctx, &op_inputs](const FunctionArgIndex& index, - eager::RemoteTensorHandle* handle) -> Status { + [ctx, &op_inputs, is_function]( + const FunctionArgIndex& index, + eager::RemoteTensorHandle* handle) -> Status { TensorHandle* h = op_inputs[index.index]; if (op_inputs[index.index]->Type() == TensorHandle::PACKED) { TF_RETURN_IF_ERROR( @@ -112,8 +114,14 @@ Status ExecuteNodeArgs::Init( "together."); } Device* device = absl::get(variant_device); - return ctx->RemoteMgr()->SerializeRemoteTensorHandle(h, handle, device, - device->name()); + // For a multi-device function, a remote RunComponentFunction request is + // not sent through StreamingEnqueueAsync. It could arrive at a remote + // worker before a remote execution request which produces an input of the + // component function. So we wait until the remote input is ready before + // serializing it. 
+ const bool wait_util_ready = is_function; + return ctx->RemoteMgr()->SerializeRemoteTensorHandle( + h, wait_util_ready, handle, device, device->name()); }; } #endif // !IS_MOBILE_PLATFORM diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc index 49fa69e2185..dbfc5639017 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc @@ -705,8 +705,8 @@ Status TensorHandle::AddEmptyLocalMirror(const Device* d) { } #if !defined(IS_MOBILE_PLATFORM) -Status TensorHandle::RemoteAddressUntilReady(const Device* d, int64* op_id, - int32* output_num) const { +Status TensorHandle::RemoteAddress(const Device* d, const bool wait_until_ready, + int64* op_id, int32* output_num) const { DVLOG(3) << "RemoteAddress on TensorHandle: " << this << " device: " << d << " " << d->name(); @@ -714,7 +714,8 @@ Status TensorHandle::RemoteAddressUntilReady(const Device* d, int64* op_id, tf_shared_lock l(mu_); auto mirror = remote_mirrors_.find(d->name()); if (mirror != remote_mirrors_.end()) { - return mirror->second.OpIdAndOutputNumUntilReady(op_id, output_num); + return mirror->second.OpIdAndOutputNum(wait_until_ready, op_id, + output_num); } return errors::FailedPrecondition( @@ -726,7 +727,7 @@ Status TensorHandle::RemoteAddressUntilReady(const Device* d, int64* op_id, } auto& data = absl::get(data_); - return data.OpIdAndOutputNumUntilReady(op_id, output_num); + return data.OpIdAndOutputNum(wait_until_ready, op_id, output_num); } bool TensorHandle::HasRemoteMirror(const Device* d, diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.h b/tensorflow/core/common_runtime/eager/tensor_handle.h index 6f9ee565c73..5e7638ae03c 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle.h +++ b/tensorflow/core/common_runtime/eager/tensor_handle.h @@ -168,10 +168,11 @@ class TensorHandle : public AbstractTensorHandleInterface, Status AddResourceShapeMirror(const Device* d, int64 op_id, int output_num, EagerContext* ctx); - // Return the op_id and output num if the handle refers to a remote tensor; - // and blocks until the remote tensor is ready on the given remote worker. - Status RemoteAddressUntilReady(const Device* d, int64* op_id, - int32* output_num) const; + // Return the op_id and output num if the handle refers to a remote tensor. + // If wait_until_ready is true, block until the remote tensor is ready on the + // given remote worker. + Status RemoteAddress(const Device* d, const bool wait_until_ready, + int64* op_id, int32* output_num) const; // Called on an async remote tensor once it's shape has been determined. 
This // transitions the tensor handle from a non-ready to a ready state by diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 23bf324b80f..46a6181cfa9 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -970,8 +970,9 @@ TEST_F(EagerServiceImplTest, SendPackedHandleTest) { EXPECT_EQ(handle2->op_device()->name(), device2); int64 op_id; int32 output_num; - TF_ASSERT_OK(handle2->RemoteAddressUntilReady( - absl::get(handle2->device()), &op_id, &output_num)); + TF_ASSERT_OK(handle2->RemoteAddress(absl::get(handle2->device()), + /*wait_until_ready=*/true, &op_id, + &output_num)); EXPECT_EQ(op_id, 2); EXPECT_EQ(output_num, 5); diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index 5d0793b258c..090417863f3 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -147,7 +147,8 @@ void RemoteCopyNode::StartSend() { request.set_context_id(ctx_->GetContextId()); auto* remote_op = request.add_queue()->mutable_operation(); status = ctx_->RemoteMgr()->SerializeRemoteTensorHandle( - src_, remote_op->add_op_inputs()->mutable_remote_handle(), + src_, /*wait_until_ready=*/false, + remote_op->add_op_inputs()->mutable_remote_handle(), absl::get(src_->device()), absl::get(src_->DeviceOrHostCPU(*ctx_))->name()); if (!status.ok()) { @@ -316,7 +317,8 @@ Status SerializePackedHandle(const uint64 op_id, TensorHandle* packed_handle, (i == 0) && (h->dtype == DT_RESOURCE) && (ctx->OnSameTask(src_device, target_device)); TF_RETURN_IF_ERROR(ctx->RemoteMgr()->SerializeRemoteTensorHandle( - h, op->add_handles()->mutable_remote_handle(), src_device, + h, /*wait_until_ready=*/false, + op->add_handles()->mutable_remote_handle(), src_device, absl::get(h->DeviceOrHostCPU(*ctx))->name(), serialize_resource_dtype_and_shape)); } else { diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc index 7c5115d33ef..94a4f199337 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.cc @@ -74,6 +74,7 @@ Status RemoteMgr::GetMirroredResourceShape( } Status RemoteMgr::GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, + const bool wait_until_ready, int64* op_id, int32* output_num) { // TODO(allenl): Consider supporting remote handles on custom devices. 
VariantDevice device = handle->device(); @@ -82,8 +83,8 @@ Status RemoteMgr::GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, "Custom devices and remote execution are currently not supported " "together."); } - TF_RETURN_IF_ERROR(handle->RemoteAddressUntilReady(absl::get(device), - op_id, output_num)); + TF_RETURN_IF_ERROR(handle->RemoteAddress( + absl::get(device), wait_until_ready, op_id, output_num)); tensorflow::TensorHandle* h; TF_RETURN_IF_ERROR( GetTensorHandleImpl(RemoteTensorHandleInternal(*op_id, *output_num), &h)); @@ -120,13 +121,15 @@ Status RemoteMgr::DeleteTensorHandle( } Status RemoteMgr::SerializeRemoteTensorHandle( - TensorHandle* in, RemoteTensorHandle* out, Device* device, - const string& device_name, const bool serialize_resource_dtype_and_shape) { + TensorHandle* in, const bool wait_until_ready, RemoteTensorHandle* out, + Device* device, const string& device_name, + const bool serialize_resource_dtype_and_shape) { int64 op_id; int32 output_num; - if (!in->RemoteAddressUntilReady(device, &op_id, &output_num).ok()) { + if (!in->RemoteAddress(device, wait_until_ready, &op_id, &output_num).ok()) { tf_shared_lock l(remote_tensor_handle_mu_); - TF_RETURN_IF_ERROR(GetRemoteTensorHandle(in, &op_id, &output_num)); + TF_RETURN_IF_ERROR( + GetRemoteTensorHandle(in, wait_until_ready, &op_id, &output_num)); } out->Clear(); out->set_op_id(op_id); diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr.h b/tensorflow/core/distributed_runtime/eager/remote_mgr.h index 54c987d4daa..2446352c931 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr.h +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr.h @@ -61,9 +61,11 @@ class RemoteMgr { } // Serialize a remote TensorHandle to a RemoteTensorHandle. + // If wait_until_ready is true, block until the remote handle is ready on a + // remote worker. Status SerializeRemoteTensorHandle( - TensorHandle* in, RemoteTensorHandle* out, Device* device, - const string& device_name, + TensorHandle* in, const bool wait_until_ready, RemoteTensorHandle* out, + Device* device, const string& device_name, const bool serialize_resource_dtype_and_shape = false); // Deserialize a RemoteTensorHandle to a TensorHandle(local/remote). @@ -83,7 +85,8 @@ class RemoteMgr { // Returns the op_id and output_num if the given local TensorHandle exists in // remote_tensor_handle_map_. 
Status GetRemoteTensorHandle(const tensorflow::TensorHandle* handle, - int64* op_id, int32* output_num) + const bool wait_until_ready, int64* op_id, + int32* output_num) TF_SHARED_LOCKS_REQUIRED(remote_tensor_handle_mu_); Status GetTensorHandleImpl(const RemoteTensorHandleInternal& remote_handle, diff --git a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc index e4cf6277c5a..1e33a9d0f62 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_mgr_test.cc @@ -81,7 +81,8 @@ TEST_F(RemoteMgrTest, SerializeLocalTensorHandleWithRemoteMirror) { handle->SetRemoteShape(shape, remote_device_, ctx_->GetContextViewId())); RemoteTensorHandle remote_handle; TF_ASSERT_OK(remote_mgr.SerializeRemoteTensorHandle( - handle, &remote_handle, remote_device_, remote_device_->name())); + handle, /*wait_until_ready=*/true, &remote_handle, remote_device_, + remote_device_->name())); EXPECT_EQ(op_id, remote_handle.op_id()); EXPECT_EQ(output_num, remote_handle.output_num()); EXPECT_EQ(remote_device_->name(), remote_handle.device()); @@ -97,7 +98,8 @@ TEST_F(RemoteMgrTest, SerializeRemoteTensorHandle) { op_id, output_num, DT_FLOAT, remote_device_, ctx_); RemoteTensorHandle remote_handle; TF_ASSERT_OK(remote_mgr.SerializeRemoteTensorHandle( - handle, &remote_handle, remote_device_, remote_device_->name())); + handle, /*wait_until_ready=*/true, &remote_handle, remote_device_, + remote_device_->name())); EXPECT_EQ(op_id, remote_handle.op_id()); EXPECT_EQ(output_num, remote_handle.output_num()); EXPECT_EQ(remote_device_->name(), remote_handle.device()); diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc index 6cdf6b196a2..6f4d5ada759 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc @@ -194,9 +194,12 @@ string RemoteTensorHandleData::DebugString() const { " output_num: ", output_num_); } -Status RemoteTensorHandleData::OpIdAndOutputNumUntilReady( - int64* op_id, int32* output_num) const { - TF_RETURN_IF_ERROR(WaitReady("OpIdAndOutputNumUntilReady")); +Status RemoteTensorHandleData::OpIdAndOutputNum(const bool wait_util_ready, + int64* op_id, + int32* output_num) const { + if (wait_util_ready) { + TF_RETURN_IF_ERROR(WaitReady("OpIdAndOutputNumUntilReady")); + } *op_id = op_id_; *output_num = output_num_; return Status::OK(); diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h index 37ad5e721b6..5f096677225 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.h @@ -50,9 +50,10 @@ class RemoteTensorHandleData { string DebugString() const; - // Block until the remote tensor is ready on a remote worker and return the op - // id and output num. - Status OpIdAndOutputNumUntilReady(int64* op_id, int32* output_num) const; + // Return the op id and output num. If wait_util_ready is true, block until + // the remote tensor is ready on a remote worker. 
+ Status OpIdAndOutputNum(const bool wait_util_ready, int64* op_id, + int32* output_num) const; uint64 context_view_id() const { return context_view_id_; } From 0c9e56e931ba86dc67ee76e1af9d900e42825a85 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 16:08:29 -0700 Subject: [PATCH 143/412] Update attr name for Dense version Bincount. PiperOrigin-RevId: 311423709 Change-Id: Ief7c901477be8e06b1d3f98613c7390c12e9680b --- .../base_api/api_def_DenseBincount.pbtxt | 2 +- .../base_api/api_def_RaggedBincount.pbtxt | 2 +- .../base_api/api_def_SparseBincount.pbtxt | 2 +- tensorflow/core/kernels/bincount_op.cc | 28 +++++++++---------- tensorflow/core/ops/math_ops.cc | 6 ++-- .../python/kernel_tests/bincount_op_test.py | 16 +++++------ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 ++-- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 ++-- 8 files changed, 34 insertions(+), 34 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt index 3f9ec2761a1..11043899ba4 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseBincount.pbtxt @@ -28,7 +28,7 @@ The counts or summed weights for each value in the range [0, size). END } attr { - name: "binary_count" + name: "binary_output" description: < { } }; -template -struct BincountReduceFunctor { +template +struct BincountReduceFunctor { static Status Compute(OpKernelContext* context, const typename TTypes::ConstTensor& in, const typename TTypes::ConstTensor& weights, @@ -148,7 +148,7 @@ struct BincountReduceFunctor { for (int64 j = 0; j < num_cols; ++j) { Tidx value = in(i, j); if (value < num_bins) { - if (binary_count) { + if (binary_output) { out(i, value) = T(1); } else { if (weights.size()) { @@ -221,7 +221,7 @@ template class DenseBincountOp : public OpKernel { public: explicit DenseBincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* ctx) override { @@ -240,7 +240,7 @@ class DenseBincountOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({size}), &out_t)); auto out = out_t->flat(); fill(ctx->eigen_device(), out); - if (binary_count_) { + if (binary_output_) { OP_REQUIRES_OK( ctx, functor::BincountFunctor::Compute( ctx, data.flat(), weights.flat(), out, size)); @@ -259,7 +259,7 @@ class DenseBincountOp : public OpKernel { ctx, ctx->allocate_output(0, TensorShape({num_rows, size}), &out_t)); auto out = out_t->matrix(); fill(ctx->eigen_device(), out_t->flat()); - if (binary_count_) { + if (binary_output_) { OP_REQUIRES_OK( ctx, functor::BincountReduceFunctor::Compute( ctx, data.matrix(), weight_matrix, out, size)); @@ -273,7 +273,7 @@ class DenseBincountOp : public OpKernel { } private: - bool binary_count_; + bool binary_output_; }; #define REGISTER_KERNELS(Tidx, T) \ @@ -314,7 +314,7 @@ template class SparseBincountOp : public OpKernel { public: explicit SparseBincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* ctx) override { @@ -338,7 +338,7 @@ class SparseBincountOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({size}), &out_t)); auto out = out_t->flat(); 
fill(ctx->eigen_device(), out); - if (binary_count_) { + if (binary_output_) { OP_REQUIRES_OK(ctx, functor::BincountFunctor::Compute( ctx, values, weights, out, size)); @@ -359,7 +359,7 @@ class SparseBincountOp : public OpKernel { const int64 batch = indices_mat(i, 0); const Tidx bin = values(i); if (bin < size) { - if (binary_count_) { + if (binary_output_) { out(batch, bin) = T(1); } else { if (weights_size) { @@ -374,7 +374,7 @@ class SparseBincountOp : public OpKernel { } private: - bool binary_count_; + bool binary_output_; }; #define REGISTER_KERNELS(Tidx, T) \ @@ -395,7 +395,7 @@ template class RaggedBincountOp : public OpKernel { public: explicit RaggedBincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* ctx) override { @@ -429,7 +429,7 @@ class RaggedBincountOp : public OpKernel { OP_REQUIRES(ctx, bin >= 0, errors::InvalidArgument("Input must be non-negative")); if (bin < size) { - if (binary_count_) { + if (binary_output_) { out(batch_idx - 1, bin) = T(1); } else { T value = (weights_size > 0) ? weights(idx) : T(1); @@ -440,7 +440,7 @@ class RaggedBincountOp : public OpKernel { } private: - bool binary_count_; + bool binary_output_; }; #define REGISTER_KERNELS(Tidx, T) \ diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 7ac003379d4..cbf03d7b045 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -1657,7 +1657,7 @@ REGISTER_OP("DenseBincount") .Input("weights: T") .Attr("Tidx: {int32, int64}") .Attr("T: {int32, int64, float32, float64}") - .Attr("binary_count: bool = false") + .Attr("binary_output: bool = false") .Output("output: T") .SetShapeFn([](InferenceContext* c) { ShapeHandle unused; @@ -1704,7 +1704,7 @@ REGISTER_OP("SparseBincount") .Input("weights: T") .Attr("Tidx: {int32, int64}") .Attr("T: {int32, int64, float32, float64}") - .Attr("binary_count: bool = false") + .Attr("binary_output: bool = false") .Output("output: T") .SetShapeFn([](InferenceContext* c) { const Tensor* size_tensor = c->input_tensor(3); @@ -1754,7 +1754,7 @@ REGISTER_OP("RaggedBincount") .Input("weights: T") .Attr("Tidx: {int32, int64}") .Attr("T: {int32, int64, float32, float64}") - .Attr("binary_count: bool = false") + .Attr("binary_output: bool = false") .Output("output: T") .SetShapeFn([](InferenceContext* c) { c->set_output(0, c->UnknownShape()); diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py index 4178e1203e2..222716dfdfa 100644 --- a/tensorflow/python/kernel_tests/bincount_op_test.py +++ b/tensorflow/python/kernel_tests/bincount_op_test.py @@ -183,7 +183,7 @@ class BincountOpTest(test_util.TensorFlowTestCase, parameterized.TestCase): np_out, self.evaluate( gen_math_ops.dense_bincount( - input=inp, weights=[], size=size, binary_count=True))) + input=inp, weights=[], size=size, binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -201,7 +201,7 @@ class BincountOpTest(test_util.TensorFlowTestCase, parameterized.TestCase): np_out, self.evaluate( gen_math_ops.dense_bincount( - input=inp, weights=np_weight, size=size, binary_count=True))) + input=inp, weights=np_weight, size=size, binary_output=True))) def _test_bincount_col_count(self, num_rows, num_cols, size, dtype): np.random.seed(42) @@ -230,7 +230,7 @@ class BincountOpTest(test_util.TensorFlowTestCase, 
parameterized.TestCase): np_out, self.evaluate( gen_math_ops.dense_bincount( - input=inp, weights=[], size=size, binary_count=True))) + input=inp, weights=[], size=size, binary_output=True))) def _test_bincount_col_count_with_weights(self, num_rows, num_cols, size, dtype): @@ -401,7 +401,7 @@ class SparseBincountOpTest(test_util.TensorFlowTestCase, dense_shape=[num_rows], size=size, weights=[], - binary_count=True))) + binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -427,7 +427,7 @@ class SparseBincountOpTest(test_util.TensorFlowTestCase, dense_shape=[num_rows], size=size, weights=inp_weight, - binary_count=True))) + binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -490,7 +490,7 @@ class SparseBincountOpTest(test_util.TensorFlowTestCase, dense_shape=inp_sparse.dense_shape, size=size, weights=[], - binary_count=True))) + binary_output=True))) class RaggedBincountOpTest(test_util.TensorFlowTestCase, @@ -530,7 +530,7 @@ class RaggedBincountOpTest(test_util.TensorFlowTestCase, values=x.values, weights=[], size=6, - binary_count=True))) + binary_output=True))) @parameterized.parameters([{ "dtype": np.int32, @@ -629,7 +629,7 @@ class RaggedBincountOpTest(test_util.TensorFlowTestCase, values=x.values, weights=[], size=size, - binary_count=True))) + binary_output=True))) if __name__ == "__main__": diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index e622768979c..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1074,7 +1074,7 @@ tf_module { } member_method { name: "DenseBincount" - argspec: "args=[\'input\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'input\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "DenseCountSparseOutput" @@ -3070,7 +3070,7 @@ tf_module { } member_method { name: "RaggedBincount" - argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "RaggedCountSparseOutput" @@ -4082,7 +4082,7 @@ tf_module { } member_method { name: "SparseBincount" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "SparseConcat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index e622768979c..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1074,7 +1074,7 @@ tf_module { } member_method { name: "DenseBincount" - argspec: "args=[\'input\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'input\', \'size\', \'weights\', \'binary_output\', \'name\'], 
varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "DenseCountSparseOutput" @@ -3070,7 +3070,7 @@ tf_module { } member_method { name: "RaggedBincount" - argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "RaggedCountSparseOutput" @@ -4082,7 +4082,7 @@ tf_module { } member_method { name: "SparseBincount" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'size\', \'weights\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], " } member_method { name: "SparseConcat" From a03da3516d600c769e59f9aeddb312013ffe9e54 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Wed, 13 May 2020 16:15:33 -0700 Subject: [PATCH 144/412] Register sparse FullyConnected kernel by default. PiperOrigin-RevId: 311424830 Change-Id: Id72f75124b59fa11f9cb84447d7a886a579bae39 --- tensorflow/lite/kernels/fully_connected.cc | 121 +++++++----------- .../lite/kernels/fully_connected_test.cc | 9 +- tensorflow/lite/kernels/register.cc | 2 +- 3 files changed, 52 insertions(+), 80 deletions(-) diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc index 1cd1b14e7a8..cbc3efd5da5 100644 --- a/tensorflow/lite/kernels/fully_connected.cc +++ b/tensorflow/lite/kernels/fully_connected.cc @@ -61,8 +61,6 @@ enum KernelType { kReference, kGenericOptimized, kLegacyPie, // Legacy path used by the PIE team and related clients. - kSparseReference, - kSparseOptimized, }; struct OpData { @@ -631,57 +629,20 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, FullyConnectedParams op_params; op_params.float_activation_min = output_activation_min; op_params.float_activation_max = output_activation_max; - reference_ops::FullyConnected( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); - } else if (kernel_type == kSparseReference) { - FullyConnectedParams op_params; - op_params.float_activation_min = output_activation_min; - op_params.float_activation_max = output_activation_max; - TF_LITE_ENSURE(context, filter->sparsity != nullptr); - - const auto& sparsity = *filter->sparsity; - reference_ops::FullyConnectedSparseWeight( - sparsity, op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output)); - } else if (kernel_type == kSparseOptimized) { - FullyConnectedParams op_params; - op_params.float_activation_min = output_activation_min; - op_params.float_activation_max = output_activation_max; - TF_LITE_ENSURE(context, filter->sparsity != nullptr); - - const auto& sparsity = *filter->sparsity; - if (!SupportedSparsityFormat(sparsity)) { - TF_LITE_KERNEL_LOG(context, - "Unsupported sparse fully-connected weight format."); - return kTfLiteError; - } - - if (sparsity.dim_metadata_size == kDimMetadataSizeRandomSparse) { - // Random sparse. 
- optimized_ops::FullyConnectedSparseWeight( - sparsity, op_params, GetTensorShape(input), - GetTensorData(input), GetTensorShape(filter), - GetTensorData(filter), GetTensorShape(bias), - GetTensorData(bias), GetTensorShape(output), - GetTensorData(output)); - } else if (sparsity.dim_metadata_size == kDimMetadataSizeBlockSparse && - sparsity.dim_metadata[2].dense_size == 4) { - // Block sparse with block size of 1x4. - optimized_ops::FullyConnectedSparseWeight1x4( + if (filter->sparsity != nullptr) { + const auto& sparsity = *filter->sparsity; + reference_ops::FullyConnectedSparseWeight( sparsity, op_params, GetTensorShape(input), GetTensorData(input), GetTensorShape(filter), GetTensorData(filter), GetTensorShape(bias), GetTensorData(bias), GetTensorShape(output), GetTensorData(output)); } else { - TF_LITE_KERNEL_LOG(context, - "Unsupported sparse fully-connected weight format."); - return kTfLiteError; + reference_ops::FullyConnected( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(filter), GetTensorData(filter), + GetTensorShape(bias), GetTensorData(bias), + GetTensorShape(output), GetTensorData(output)); } } else if (kernel_type == kLegacyPie) { return EvalPie(context, node, params, data, input, filter, bias, output); @@ -689,14 +650,47 @@ TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, FullyConnectedParams op_params; op_params.float_activation_min = output_activation_min; op_params.float_activation_max = output_activation_max; - op_params.lhs_cacheable = IsConstantTensor(filter); - op_params.rhs_cacheable = IsConstantTensor(input); - optimized_ops::FullyConnected( - op_params, GetTensorShape(input), GetTensorData(input), - GetTensorShape(filter), GetTensorData(filter), - GetTensorShape(bias), GetTensorData(bias), - GetTensorShape(output), GetTensorData(output), - CpuBackendContext::GetFromContext(context)); + if (filter->sparsity != nullptr) { + const auto& sparsity = *filter->sparsity; + if (!SupportedSparsityFormat(sparsity)) { + TF_LITE_KERNEL_LOG(context, + "Unsupported sparse fully-connected weight format."); + return kTfLiteError; + } + + if (sparsity.dim_metadata_size == kDimMetadataSizeRandomSparse) { + // Random sparse. + optimized_ops::FullyConnectedSparseWeight( + sparsity, op_params, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); + } else if (sparsity.dim_metadata_size == kDimMetadataSizeBlockSparse && + sparsity.dim_metadata[2].dense_size == 4) { + // Block sparse with block size of 1x4. 
+ optimized_ops::FullyConnectedSparseWeight1x4( + sparsity, op_params, GetTensorShape(input), + GetTensorData(input), GetTensorShape(filter), + GetTensorData(filter), GetTensorShape(bias), + GetTensorData(bias), GetTensorShape(output), + GetTensorData(output)); + } else { + TF_LITE_KERNEL_LOG(context, + "Unsupported sparse fully-connected weight format."); + return kTfLiteError; + } + + } else { + op_params.lhs_cacheable = IsConstantTensor(filter); + op_params.rhs_cacheable = IsConstantTensor(input); + optimized_ops::FullyConnected( + op_params, GetTensorShape(input), GetTensorData(input), + GetTensorShape(filter), GetTensorData(filter), + GetTensorShape(bias), GetTensorData(bias), + GetTensorShape(output), GetTensorData(output), + CpuBackendContext::GetFromContext(context)); + } } return kTfLiteOk; @@ -757,23 +751,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } // namespace fully_connected -// TODO(b/147449640): Clean up sparse registrations after conversion is done. -TfLiteRegistration* Register_FULLY_CONNECTED_SPARSE_REF() { - static TfLiteRegistration r = { - fully_connected::Init, fully_connected::Free, - fully_connected::Prepare, - fully_connected::Eval}; - return &r; -} - -TfLiteRegistration* Register_FULLY_CONNECTED_SPARSE_OPT() { - static TfLiteRegistration r = { - fully_connected::Init, fully_connected::Free, - fully_connected::Prepare, - fully_connected::Eval}; - return &r; -} - TfLiteRegistration* Register_FULLY_CONNECTED_REF() { static TfLiteRegistration r = { fully_connected::Init, fully_connected::Free, diff --git a/tensorflow/lite/kernels/fully_connected_test.cc b/tensorflow/lite/kernels/fully_connected_test.cc index 34d68cf0b0d..7227b8a5e92 100644 --- a/tensorflow/lite/kernels/fully_connected_test.cc +++ b/tensorflow/lite/kernels/fully_connected_test.cc @@ -361,11 +361,6 @@ const auto kKernelMapNoPie = new std::map({ {"GenericOptimized", ops::builtin::Register_FULLY_CONNECTED_GENERIC_OPT()}, }); -const auto kKernelMapSparse = new std::map({ - {"SparseReference", ops::builtin::Register_FULLY_CONNECTED_SPARSE_REF()}, - {"SparseOptimized", ops::builtin::Register_FULLY_CONNECTED_SPARSE_OPT()}, -}); - class QuantizedFullyConnectedOpTest : public SingleOpTest { protected: const std::map& GetKernelMap() override { @@ -1187,7 +1182,7 @@ class SparseFullyConnectedOpModel : public SingleOpModel { class SparseFullyConnectedOpTest : public SingleOpTest { protected: const std::map& GetKernelMap() override { - return *kKernelMapSparse; + return *kKernelMapNoPie; } }; @@ -1277,7 +1272,7 @@ TEST_P(SparseFullyConnectedOpTest, Simple1x4Test) { INSTANTIATE_TEST_SUITE_P( SparseFullyConnectedOpTest, SparseFullyConnectedOpTest, - ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapSparse))); + ::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMapNoPie))); } // namespace } // namespace tflite diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index f3a321e325b..8ca58e6a309 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -77,7 +77,7 @@ BuiltinOpResolver::BuiltinOpResolver() { Register_EMBEDDING_LOOKUP_SPARSE()); AddBuiltin(BuiltinOperator_FULLY_CONNECTED, Register_FULLY_CONNECTED(), /* min_version = */ 1, - /* max_version = */ 7); + /* max_version = */ 8); AddBuiltin(BuiltinOperator_LSH_PROJECTION, Register_LSH_PROJECTION()); AddBuiltin(BuiltinOperator_HASHTABLE_LOOKUP, Register_HASHTABLE_LOOKUP()); AddBuiltin(BuiltinOperator_SOFTMAX, Register_SOFTMAX(), From 
1798a5e959d6781764c2beec673e61cc58c26455 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 13 May 2020 16:18:13 -0700 Subject: [PATCH 145/412] [XLA:SPMD] Fix reshape with halo exchange PiperOrigin-RevId: 311425288 Change-Id: Ia1e29df7b16d9eb60953aba3336022505e823d3a --- .../xla/service/spmd/spmd_partitioner.cc | 2 +- .../xla/service/spmd/spmd_partitioner_test.cc | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index fd865342ca3..b857c8bdbe6 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -1661,7 +1661,7 @@ Status SpmdPartitioningVisitor::HandleReshape(HloInstruction* hlo) { } TF_RET_CHECK(!reshard_output->dynamic_slice_index_on_output.has_value()); CHECK_EQ( - reshard_output->sharded_input->shape().dimensions(input_sharded_dim), + reshard_output->sharded_input->shape().dimensions(output_sharded_dim), output_shard_shape.dimensions(output_sharded_dim)); SetPartitionedHlo(hlo, [&] { return reshard_output->sharded_input; }); return Status::OK(); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 7a7f2dcc807..ca1afc816b0 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -27,6 +27,7 @@ namespace xla { namespace spmd { namespace { +using ::testing::_; using ::testing::AllOf; namespace op = xla::testing::opcode_matchers; @@ -1994,6 +1995,29 @@ ENTRY entry { op::Shape("f32[38,38,4,41]"))); } +TEST_F(SpmdPartitioningTest, ReshapeMergeDimsWithHaloExchange) { + const char* const hlo_string = R"( +HloModule module + +ENTRY entry { + %input = s32[2,3,7,10] parameter(0), sharding={devices=[1,1,2,1]0,1} + ROOT %reshape = s32[3,2,1,14,5] reshape(%input), + sharding={devices=[1,1,1,2,1]0,1} +})"; + + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/2)); + VLOG(1) << module->ToString(); + + auto reshape = + AllOf(op::Reshape(op::Parameter(0)), op::Shape("s32[3,2,1,8,5]")); + auto halo = op::CollectivePermute(op::Slice(reshape)); + auto exchanged = + op::DynamicSlice(op::Concatenate(halo, reshape), _, _, _, _, _); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(exchanged, op::Shape("s32[3,2,1,7,5]"))); +} + // Produces an invalid module after transformation. TEST_F(SpmdPartitioningTest, InceptionV3_4_way_ReduceWindowDilated) { const char* const hlo_string = R"( From 07568a96e8a6ab5c492ff5d7ebba5efca2f37a1e Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Wed, 13 May 2020 16:18:49 -0700 Subject: [PATCH 146/412] Simplify `trainable` in batchnorm layers to just use the python variable rather than the private _trainable_var created by `backend.freezable_variable` Compile/Fit continues to reflect the updated value of trainable if you recompile after changing trainable. This change does come with a behavior change in subtle situations though. `backend.freezable_variable` occupied a strange no-mans land in between tf.variable and python state. It allowed you to update the value in a tf.function even after tracing occurred (like a tf.variable). But, it did not appear in lists of variables of the model/saved_models. 
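
To make the distinction concrete, here is a minimal, hypothetical sketch (invented class and attribute names, not part of this change, assuming TF 2.x eager execution) of how ordinary Python state and variable-backed state behave inside a traced tf.function:

    import tensorflow as tf

    class Block(tf.Module):

      def __init__(self):
        self.trainable = True          # plain Python state
        self.flag = tf.Variable(True)  # variable-backed state

      @tf.function
      def apply(self, x):
        # `self.trainable` is read while tracing and baked into the graph;
        # `self.flag` is a variable, so its current value is read at call time.
        if self.trainable:
          x = x + 1.0
        return tf.cond(self.flag, lambda: x * 2.0, lambda: x)

    b = Block()
    print(b.apply(tf.constant(1.0)).numpy())  # 4.0: traced with trainable=True
    b.trainable = False
    b.flag.assign(False)
    print(b.apply(tf.constant(1.0)).numpy())  # 2.0: the +1.0 is still baked in,
                                              # only the variable update is seen

The old `freezable_variable` behaved like the variable above (updates were visible to existing traces), while a plain Python attribute behaves like `self.trainable` above (its value is frozen into the trace until a new trace happens).
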
(It would probably act in unpredictable ways when a single batchnorm layer was used in several different tf.functions, because the layers only maintained a single freezable_variable even though each funcgraph would've needed its own. It's also unclear how it actually behaved in loaded saved_models.) So, before this code change, disabling/enabling `trainable` after a tf.function containing batchnorm had already been traced caused existing traces to reflect the new value of `trainable`. Now, because `trainable` is standard python state it acts the same way as other python state in tf.functions. The value will be frozen at tracing time. If you want to update `trainable` after tracing you must trace a new tf.function. PiperOrigin-RevId: 311425391 Change-Id: I51166212efa28b56c4193f9358907a9dc54b7d2d --- tensorflow/python/keras/backend.py | 47 ----------------- .../python/keras/layers/normalization.py | 19 ++----- .../python/keras/layers/normalization_test.py | 50 ++++--------------- .../saving/saved_model/saved_model_test.py | 31 ++++++++++++ 4 files changed, 44 insertions(+), 103 deletions(-) diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 2700fae9e29..11e53e032ae 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -1162,53 +1162,6 @@ def is_placeholder(x): return False -def freezable_variable(value, shape=None, name=None): - """A tensor-like object whose value can be updated only up until execution. - - After creating the freezable variable, you can update its value by calling - `var.update_value(new_value)` (similar to a regular variable). - Unlike an actual variable, the value used during execution is the current - value at the time the execution function (`backend.function()`) was created. - - This is an internal API, expected to be temporary. It is used to implement a - mutable `trainable` property for `BatchNormalization` layers, with a frozen - value after model compilation. - - We don't use a plain variable in this case because we need the value used - in a specific model to be frozen after `compile` has been called - (e.g. GAN use case). - - Arguments: - value: The initial value for the tensor-like object. - shape: The shape for the tensor-like object (cannot be changed). - name: The name for the tensor-like object. - - Returns: - A tensor-like object with a static value that can be updated via - `x.update_value(new_value)`, up until creating an execution function - (afterwards the value is fixed). - """ - graph = get_graph() - with graph.as_default(): - x = array_ops.placeholder_with_default( - value, shape=shape, name=name) - x._initial_value = value - x._current_value = value - - def update_value(new_value): - x._current_value = new_value - - def get_value(): - return x._current_value - - x.update_value = update_value - x.get_value = get_value - - global _FREEZABLE_VARS - _FREEZABLE_VARS[graph].add(x) - return x - - @keras_export('keras.backend.shape') def shape(x): """Returns the symbolic shape of a tensor or variable. 
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py index 9a35cd86525..a6d3c3c3e1c 100644 --- a/tensorflow/python/keras/layers/normalization.py +++ b/tensorflow/python/keras/layers/normalization.py @@ -28,7 +28,6 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras import constraints from tensorflow.python.keras import initializers from tensorflow.python.keras import regularizers -from tensorflow.python.keras.engine import base_layer_utils from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_spec import InputSpec from tensorflow.python.keras.utils import tf_utils @@ -234,7 +233,6 @@ class BatchNormalizationBase(Layer): self.fused = fused self._bessels_correction_test_only = True - self._trainable_var = None self.trainable = trainable if renorm: @@ -294,14 +292,6 @@ class BatchNormalizationBase(Layer): @trainable.setter def trainable(self, value): self._trainable = value - if self._trainable_var is not None: - self._trainable_var.update_value(value) - - def _get_trainable_var(self): - if self._trainable_var is None: - self._trainable_var = K.freezable_variable( - self._trainable, name=self.name + '_trainable') - return self._trainable_var @property def _param_dtype(self): @@ -722,12 +712,9 @@ class BatchNormalizationBase(Layer): if self._USE_V2_BEHAVIOR: if isinstance(training, int): training = bool(training) - if base_layer_utils.is_in_keras_graph(): - training = math_ops.logical_and(training, self._get_trainable_var()) - elif not self.trainable: - # When the layer is not trainable, it overrides the value passed from - # model. - training = self.trainable + # When the layer is not trainable, it overrides the value passed from + # model. + training = math_ops.logical_and(training, self.trainable) return training def call(self, inputs, training=None): diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py index ad5d00eb4d9..4d1e3213ba7 100644 --- a/tensorflow/python/keras/layers/normalization_test.py +++ b/tensorflow/python/keras/layers/normalization_test.py @@ -22,7 +22,6 @@ from absl.testing import parameterized import numpy as np from tensorflow.python import keras -from tensorflow.python.eager import backprop from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import wrap_function @@ -35,7 +34,6 @@ from tensorflow.python.keras import testing_utils from tensorflow.python.keras.layers import normalization from tensorflow.python.keras.layers import normalization_v2 from tensorflow.python.keras.mixed_precision.experimental import policy -from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2 from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker_v2 from tensorflow.python.ops import math_ops @@ -170,6 +168,13 @@ class BatchNormalizationTest(keras_parameterized.TestCase): @keras_parameterized.run_all_keras_modes(always_skip_v1=True) def test_batchnorm_non_trainable_with_fit(self): + # We use the same data shape for all the data we use in this test. + # This will prevent any used tf.functions from retracing. + # This helps us verify that changing trainable and recompiling really + # does update the training loop, rather than a different data shape + # triggering a retrace. 
+ data_shape = (100, 3) + inputs = keras.Input((3,)) bn = normalization_v2.BatchNormalization() outputs = bn(inputs) @@ -178,10 +183,10 @@ class BatchNormalizationTest(keras_parameterized.TestCase): 'rmsprop', 'mse', run_eagerly=testing_utils.should_run_eagerly()) - model.fit(np.random.random((100, 3)), np.random.random((100, 3))) + model.fit(np.random.random(data_shape), np.random.random(data_shape)) - test_data = np.random.random((10, 3)) - test_targets = np.random.random((10, 3)) + test_data = np.random.random(data_shape) + test_targets = np.random.random(data_shape) test_loss = model.evaluate(test_data, test_targets) bn.trainable = False @@ -192,41 +197,6 @@ class BatchNormalizationTest(keras_parameterized.TestCase): train_loss = model.train_on_batch(test_data, test_targets) self.assertAlmostEqual(test_loss, train_loss) - @combinations.generate(combinations.combine(mode=['graph', 'eager'])) - def test_batchnorm_non_trainable_with_tf_function(self): - inputs = keras.Input((3,)) - bn = normalization_v2.BatchNormalization() - outputs = bn(inputs) - model = keras.Model(inputs, outputs) - loss_fn = keras.losses.MeanSquaredError() - optimizer = rmsprop_v2.RMSprop() - - @def_function.function() - def train_step(x, y): - with backprop.GradientTape() as tape: - y_pred = model(x, training=True) - loss = loss_fn(y, y_pred) - grads = tape.gradient(loss, model.trainable_weights) - optimizer.apply_gradients(zip(grads, model.trainable_weights)) - return loss - - @def_function.function() - def test_step(x, y): - y_pred = model(x, training=False) - loss = loss_fn(y, y_pred) - return loss - - train_step(np.random.random((100, 3)), np.random.random((100, 3))) - - test_data = np.random.random((10, 3)) - test_targets = np.random.random((10, 3)) - test_loss = test_step(test_data, test_targets) - - bn.trainable = False - train_loss = train_step(test_data, test_targets) - if context.executing_eagerly(): - self.assertAlmostEqual(test_loss.numpy(), train_loss.numpy()) - def test_eager_batchnorm_in_custom_model_call_with_tf_function(self): class MyModel(keras.Model): diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 9cbe8607a54..30a93e2bba3 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -391,6 +391,37 @@ class TestModelSavingAndLoadingV2(keras_parameterized.TestCase): self.evaluate(loaded.get_updates_for(input_arr2)) self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) + def testDisablingBatchNormTrainableBeforeSaving(self): + # We disable trainable on the batchnorm layers before saving + model = keras.models.Sequential( + keras.layers.BatchNormalization(input_shape=(1,))) + model.trainable = False + self.evaluate(variables.variables_initializer(model.variables)) + saved_model_dir = self._save_model_dir() + model.save(saved_model_dir, save_format='tf') + loaded = keras_load.load(saved_model_dir) + self.evaluate(variables.variables_initializer(loaded.variables)) + input_arr = array_ops.constant([[11], [12], [13]], dtype=dtypes.float32) + input_arr2 = array_ops.constant([[14], [15], [16]], dtype=dtypes.float32) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0]) + + # Trainable should still be disabled after loading + self.evaluate(loaded(input_arr, training=True)) + if not context.executing_eagerly(): + self.evaluate(loaded.get_updates_for(input_arr)) + 
self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.0]) + + # Re-enabling trainable on the loaded model should cause the batchnorm + # layer to start training again. + # Note: this only works in v2. + if context.executing_eagerly(): + loaded.trainable = True + self.evaluate(loaded(input_arr, training=True)) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) + + self.evaluate(loaded(input_arr2, training=False)) + self.assertAllClose(self.evaluate(loaded.layers[-1].moving_mean), [0.12]) + def testSaveWithSignatures(self): model = keras.models.Sequential() model.add(keras.layers.Dense(5, input_shape=(3,), From 4fd957d3cf0dab49d7a8c77b724560768dbfdcb2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 16:38:42 -0700 Subject: [PATCH 147/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311428968 Change-Id: Ib63776765d20322f80be9dc261f394486746eddc --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 7a07a0e78d8..bab430e1472 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. 
-// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From e35304c8b9dda8c46811112f106264c6e29a1e78 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Wed, 13 May 2020 16:45:39 -0700 Subject: [PATCH 148/412] Add simple canonicalizer for tf.fill Avoids need for using fallback converter, especially given splat nature vs the cost of converting back and forth from tensor. 
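As a rough illustration of the splat equivalence the new folder relies on, here is a sketch in plain TensorFlow Python (illustrative only, not part of the change itself; the shape and value mirror the canonicalize.mlir test added below):

    import numpy as np
    import tensorflow as tf

    # A Fill whose dims and value operands are both constants is just a splat
    # tensor, so it can be folded to a single constant instead of taking the
    # fallback conversion path.
    filled = tf.fill(tf.constant([3, 2, 1]), tf.constant(23.0))
    splat = tf.constant(23.0, shape=[3, 2, 1])
    assert np.array_equal(filled.numpy(), splat.numpy())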
PiperOrigin-RevId: 311430247 Change-Id: Ia5f235176f87d355b084c95073350cb890d711c4 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 2 ++ .../compiler/mlir/tensorflow/ir/tf_ops.cc | 25 ++++++++++++++++++- .../mlir/tensorflow/tests/canonicalize.mlir | 11 ++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 2d02d0b7508..64ea0732e8c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -2907,6 +2907,8 @@ fill([2, 3], 9) ==> [[9, 9, 9] return Verify(*this); }]; + let hasFolder = 1; + let builders = [OpBuilder< "OpBuilder &builder, OperationState &result, Value dims, Value value" >]; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 82ddc80875a..2007824369c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1606,7 +1606,7 @@ static ShapedType InferFillOpType(Value dims, Value value) { llvm::SmallVector shape; shape.reserve(dims_attr.getNumElements()); - for (const APInt &dim : dims_attr.getValues()) { + for (const APInt dim : dims_attr.getValues()) { shape.push_back(dim.getSExtValue()); } return RankedTensorType::get(shape, etype); @@ -1617,6 +1617,29 @@ void FillOp::build(OpBuilder &builder, OperationState &result, Value dims, FillOp::build(builder, result, InferFillOpType(dims, value), dims, value); } +OpFoldResult FillOp::fold(ArrayRef operands) { + assert(operands.size() == 2 && "fill op has two operand"); + + auto value = operands[1].dyn_cast_or_null(); + if (!value) return {}; + + auto type = getType().cast(); + if (type.hasStaticShape()) + return DenseElementsAttr::get(type, value.getValue({})); + + auto dims = operands[0].dyn_cast_or_null(); + if (!dims) return {}; + + llvm::SmallVector shape; + shape.reserve(dims.getNumElements()); + for (const APInt dim : dims.getValues()) { + shape.push_back(dim.getSExtValue()); + } + type = RankedTensorType::get(shape, type.getElementType()); + + return DenseElementsAttr::get(type, value.getValue({})); +} + //===----------------------------------------------------------------------===// // FusedBatchNormGradOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 18f8d5f4486..e05894dc266 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -471,3 +471,14 @@ func @testRankOfRankedTensor(%arg0 : tensor<4x3x2xf32>) -> tensor { // CHECK: return [[VAL0]] return %0 : tensor } + +// CHECK-LABEL: @foldFill +func @foldFill() -> (tensor<3x2x1xf32>, tensor<*xf32>) { + %0 = "tf.Const"() {value = dense<[3, 2, 1]> : tensor<3xi32>} : () -> tensor<3xi32> + %1 = "tf.Const"() {value = dense<23.0> : tensor} : () -> tensor + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %2 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<3x2x1xf32> + // CHECK: "tf.Const"() {value = dense<2.300000e+01> : tensor<3x2x1xf32>} + %3 = "tf.Fill"(%0, %1) : (tensor<3xi32>, tensor) -> tensor<*xf32> + return %2, %3 : tensor<3x2x1xf32>, tensor<*xf32> +} From d0a48afee650b12dde805fadca868d6b113c3c5d Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Wed, 
13 May 2020 17:03:13 -0700 Subject: [PATCH 149/412] Add a few more utility functions for TPUs PiperOrigin-RevId: 311433350 Change-Id: I62a3bd2635f4eb07a21f3b1cdb1bbea5017e6851 --- tensorflow/core/tpu/BUILD | 25 +++++++++++++ tensorflow/core/tpu/tpu_compilation_device.cc | 24 ++++++++++++ tensorflow/core/tpu/tpu_defs.h | 9 +++++ tensorflow/core/tpu/tpu_node_device_util.cc | 37 +++++++++++++++++++ tensorflow/core/tpu/tpu_node_device_util.h | 30 +++++++++++++++ 5 files changed, 125 insertions(+) create mode 100644 tensorflow/core/tpu/tpu_compilation_device.cc create mode 100644 tensorflow/core/tpu/tpu_node_device_util.cc create mode 100644 tensorflow/core/tpu/tpu_node_device_util.h diff --git a/tensorflow/core/tpu/BUILD b/tensorflow/core/tpu/BUILD index 4ea5fc39929..46a8759a257 100644 --- a/tensorflow/core/tpu/BUILD +++ b/tensorflow/core/tpu/BUILD @@ -37,10 +37,35 @@ cc_library( ], ) +cc_library( + name = "tpu_compilation_device", + srcs = ["tpu_compilation_device.cc"], + visibility = ["//visibility:public"], + deps = [ + ":tpu_defs", + ":tpu_node_device_util", + "//tensorflow/compiler/tf2xla:xla_compiler", + ], + alwayslink = 1, +) + +cc_library( + name = "tpu_node_device_util", + srcs = ["tpu_node_device_util.cc"], + hdrs = ["tpu_node_device_util.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/tf2xla:tf2xla_util", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) + cc_library( name = "tpu_defs", srcs = ["tpu_defs.cc"], hdrs = ["tpu_defs.h"], + deps = ["//tensorflow/core:protos_all_cc"], ) cc_library( diff --git a/tensorflow/core/tpu/tpu_compilation_device.cc b/tensorflow/core/tpu/tpu_compilation_device.cc new file mode 100644 index 00000000000..2b2314820bc --- /dev/null +++ b/tensorflow/core/tpu/tpu_compilation_device.cc @@ -0,0 +1,24 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/tpu/tpu_defs.h" +#include "tensorflow/core/tpu/tpu_node_device_util.h" + +namespace tensorflow { + +REGISTER_XLA_BACKEND(DEVICE_TPU_XLA_JIT, kTpuAllTypes, TpuOpFilter); + +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_defs.h b/tensorflow/core/tpu/tpu_defs.h index b2a6e3ce303..497afb5c392 100644 --- a/tensorflow/core/tpu/tpu_defs.h +++ b/tensorflow/core/tpu/tpu_defs.h @@ -18,6 +18,10 @@ limitations under the License. #ifndef TENSORFLOW_CORE_TPU_TPU_DEFS_H_ #define TENSORFLOW_CORE_TPU_TPU_DEFS_H_ +#include + +#include "tensorflow/core/framework/types.pb.h" + namespace tensorflow { // Name of the TPU device, which corresponds to a single core. @@ -43,6 +47,11 @@ extern const char* const TPUREPLICATE_MIRRORED_VAR_INDICES_ATTR; // variable. extern const char* const TPU_FAST_MEM_ATTR; // "_TPU_FAST_MEM" +// Supported types for TPUs. 
+static constexpr std::array kTpuAllTypes = { + {DT_INT32, DT_UINT32, DT_BFLOAT16, DT_FLOAT, DT_DOUBLE, DT_BOOL, + DT_COMPLEX64, DT_INT64, DT_UINT64, DT_QINT8, DT_QUINT8}}; + } // namespace tensorflow #endif // TENSORFLOW_CORE_TPU_TPU_DEFS_H_ diff --git a/tensorflow/core/tpu/tpu_node_device_util.cc b/tensorflow/core/tpu/tpu_node_device_util.cc new file mode 100644 index 00000000000..2dfd7d984d6 --- /dev/null +++ b/tensorflow/core/tpu/tpu_node_device_util.cc @@ -0,0 +1,37 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/tpu/tpu_node_device_util.h" + +#include "tensorflow/compiler/tf2xla/tf2xla_util.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/lib/core/stringpiece.h" + +namespace tensorflow { + +bool TpuOpFilter(KernelDef* kdef) { + StringPiece op(kdef->op()); + VLOG(2) << "TpuOpFilter " << op; + // Enable const string operands to Assert op (b/69167214). + if (op == "Const") { + AddDtypeToKernelDefConstraint("dtype", DT_STRING, kdef); + } + if (op == "Assert") { + AddDtypeToKernelDefConstraint("T", DT_STRING, kdef); + } + return true; +} + +} // namespace tensorflow diff --git a/tensorflow/core/tpu/tpu_node_device_util.h b/tensorflow/core/tpu/tpu_node_device_util.h new file mode 100644 index 00000000000..c6d5be9f5a6 --- /dev/null +++ b/tensorflow/core/tpu/tpu_node_device_util.h @@ -0,0 +1,30 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_H_ +#define TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_H_ + +#include "tensorflow/core/framework/kernel_def.pb.h" + +namespace tensorflow { + +// This is a BackendOpFilter. (see tensorflow/compiler/tf2xla/xla_op_registry.h) +// It returns true if the op should be registered on the device, it may +// optionally modify the KernelDef. +bool TpuOpFilter(KernelDef* kdef); + +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_TPU_TPU_NODE_DEVICE_H_ From a4f82e759af213872631fd9d8e6b037e69ddaa47 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Wed, 13 May 2020 17:20:44 -0700 Subject: [PATCH 150/412] Create per worker datasets in `strategy.experimental_distribute_datasets_from_function` instead of in `__iter__`. This will avoid tracing `dataset_fn` each time creating a new DistributedIterator. 
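A minimal usage sketch of the affected API, assuming a single-host MirroredStrategy (illustrative only; the `calls` list exists just to show when `dataset_fn` runs, which after this change is once per worker at construction time rather than once for every new iterator):

    import tensorflow as tf

    strategy = tf.distribute.MirroredStrategy()
    calls = []

    def dataset_fn(input_context):
      # Record each invocation of dataset_fn for demonstration purposes.
      calls.append(input_context.input_pipeline_id)
      batch_size = input_context.get_per_replica_batch_size(64)
      return tf.data.Dataset.range(1024).batch(batch_size)

    dist_dataset = strategy.experimental_distribute_datasets_from_function(
        dataset_fn)
    it1 = iter(dist_dataset)  # does not invoke dataset_fn again
    it2 = iter(dist_dataset)  # len(calls) stays at 1 on a single worker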
PiperOrigin-RevId: 311436128 Change-Id: Ib839326c6d9e0b0fad051f4baa1ceac9eef08045 --- tensorflow/python/distribute/input_lib.py | 49 ++++++++--------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 68e55d5a6af..26bc9a087fb 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -823,14 +823,15 @@ class DistributedDatasetsFromFunction(_IterableInput): "input_contexts (%d)" % (input_workers.num_workers, len(input_contexts))) - self._dataset_fn = dataset_fn self._input_workers = input_workers self._input_contexts = input_contexts self._strategy = strategy - self._element_spec = None - - super(DistributedDatasetsFromFunction, self).__init__( - input_workers=input_workers) + self._datasets, element_spec = ( + _create_datasets_per_worker_with_input_context(self._input_contexts, + self._input_workers, + dataset_fn)) + self._element_spec = _create_distributed_tensor_spec( + self._strategy, element_spec) def __iter__(self): if (ops.executing_eagerly_outside_functions() or @@ -842,9 +843,9 @@ class DistributedDatasetsFromFunction(_IterableInput): enable_legacy_iterators = getattr(self._strategy, "_enable_legacy_iterators", False) - iterators, element_spec = _create_iterators_per_worker_with_input_context( - self._input_contexts, self._input_workers, self._dataset_fn, - enable_legacy_iterators) + iterators = _create_iterators_per_worker(self._datasets, + self._input_workers, + enable_legacy_iterators) if enable_legacy_iterators: iterator = DistributedIteratorV1(self._input_workers, iterators, @@ -852,8 +853,6 @@ class DistributedDatasetsFromFunction(_IterableInput): else: iterator = DistributedIterator(self._input_workers, iterators, self._strategy) - self._element_spec = _create_distributed_tensor_spec(self._strategy, - element_spec) iterator._element_spec = self._element_spec # pylint: disable=protected-access return iterator @@ -896,13 +895,10 @@ class DistributedDatasetsFromFunctionV1(DistributedDatasetsFromFunction): return self._get_iterator() def _get_iterator(self): - iterators, element_spec = _create_iterators_per_worker_with_input_context( - self._input_contexts, self._input_workers, self._dataset_fn, - True) + iterators = _create_iterators_per_worker(self._datasets, + self._input_workers, True) iterator = DistributedIteratorV1(self._input_workers, iterators, self._strategy) - self._element_spec = _create_distributed_tensor_spec(self._strategy, - element_spec) iterator._element_spec = self._element_spec # pylint: disable=protected-access return iterator @@ -1375,27 +1371,16 @@ def _create_iterators_per_worker(worker_datasets, input_workers, return iterators -def _create_iterators_per_worker_with_input_context(input_contexts, - input_workers, - dataset_fn, - enable_legacy_iterators): - """Create a multidevice iterator per workers given a dataset function.""" - iterators = [] - element_specs = [] +def _create_datasets_per_worker_with_input_context(input_contexts, + input_workers, dataset_fn): + """Create device datasets per worker given a dataset function.""" + datasets = [] for i, ctx in enumerate(input_contexts): worker = input_workers.worker_devices[i] with ops.device(worker): dataset = dataset_fn(ctx) - element_specs.append(dataset.element_spec) - devices = input_workers.compute_devices_for_worker(i) - if tf2.enabled() and not enable_legacy_iterators: - iterator = _SingleWorkerOwnedDatasetIterator(dataset, worker, - devices) - 
else: - iterator = _SingleWorkerDatasetIterator(dataset, worker, - devices) - iterators.append(iterator) - return iterators, dataset.element_spec + datasets.append(dataset) + return datasets, dataset.element_spec # TODO(sourabhbajaj): Remove this in lieu of distributed datasets From f84726697e208ef30ed830e00acebbbe9bc06553 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 17:24:15 -0700 Subject: [PATCH 151/412] [tf.data] Update some maths formulas in the ComputeWaitTime function. PiperOrigin-RevId: 311436667 Change-Id: Ie3537625e9daac73caba5f790b90b65507f999f7 --- tensorflow/core/framework/model.cc | 229 ++++++++++++++---------- tensorflow/core/framework/model.h | 20 +++ tensorflow/core/framework/model_test.cc | 72 ++++++++ 3 files changed, 231 insertions(+), 90 deletions(-) diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index 7aeec28e995..b4a54029a4f 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -32,96 +32,6 @@ constexpr char kInputTimeDerivativeKey[] = "last_input_time"; // Wrapper for the square function to reduce verbosity. inline double Square(double x) { return x * x; } -// Given the average time between output events (`output_time`), the average -// time between input events (`input_time`) and the buffer size, the method -// computes the expected time an input event will have to wait. -// -// The wait time is approximated as the product of the probability the buffer -// will be empty and the time it takes to produce an element into the buffer. -// -// The formula used for computing the probability is derived by modeling the -// problem as an M/M/1/K queue -// (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue). -// -// Collects derivatives of `ComputeWaitTime` w.r.t `output_time`, `input_time' -// and `buffer_size` if the corresponding pointers are not `nullptr`. -double ComputeWaitTime(double output_time, double input_time, - double buffer_size, double* output_time_derivative, - double* input_time_derivative, - double* buffer_size_derivative) { - // Case 0: either the producer or the consumer are infinitely fast. Wait time - // is the time to produce an output. - if (output_time == 0 || input_time == 0) { - if (output_time_derivative) { - *output_time_derivative = 1.0L; - } - if (input_time_derivative) { - *input_time_derivative = 0.0L; - } - if (buffer_size_derivative) { - *buffer_size_derivative = 0.0L; - } - return output_time; - } - // Case 1: the consumer is slower than the producer. Wait time is 0 since the - // buffer will be full in the long run. - if (input_time > output_time) { - if (output_time_derivative) { - *output_time_derivative = 0.0L; - } - if (input_time_derivative) { - *input_time_derivative = 0.0L; - } - if (buffer_size_derivative) { - *buffer_size_derivative = 0.0L; - } - return 0; - } - // Case 2: the consumer and the producer are equally fast. Expected wait time - // decreases linearly with the size of the buffer. 
- if (input_time == output_time) { - const double p_buffer_empty = 1.0L / (buffer_size + 1.0L); - if (output_time_derivative) { - *output_time_derivative = p_buffer_empty; - } - if (input_time_derivative) { - *input_time_derivative = 0.0L; - } - if (buffer_size_derivative) { - const double p_buffer_empty_der = -1.0L / Square(buffer_size + 1.0L); - *buffer_size_derivative = p_buffer_empty_der * output_time; - } - return p_buffer_empty * output_time; - } - // Case 3: the producer is slower than the consumer and neither is infinitely - // fast. - const double alpha = 1.0L / input_time; - const double beta = 1.0L / output_time; - const double ratio_pow = std::pow((beta / alpha), (buffer_size + 1.0L)); - const double p_buffer_empty = (1.0L - beta / alpha) / (1.0L - ratio_pow); - if (output_time_derivative) { - *output_time_derivative = - (1.0L - ratio_pow - - (output_time - input_time) * (buffer_size + 1.0L) * ratio_pow / - output_time) / - Square(1.0L - ratio_pow); - } - if (input_time_derivative) { - *input_time_derivative = - (ratio_pow - 1.0L + - (buffer_size + 1.0L) * ratio_pow * (alpha / beta - 1.0L)) / - Square(1.0L - ratio_pow); - } - if (buffer_size_derivative) { - const double p_buffer_empty_der = (1.0L - beta / alpha) * ratio_pow * - std::log(beta / alpha) / - Square(1.0L - ratio_pow); - *buffer_size_derivative = p_buffer_empty_der * output_time; - } - - return p_buffer_empty * output_time; -} - // The first input of InterleaveMany corresponds to the input dataset whose // elements are used to create the (derived) input datasets whose elements are // interleaved as output. @@ -700,6 +610,145 @@ std::shared_ptr MakeUnknownNode(Node::Args args) { return std::make_shared(std::move(args)); } +double Node::ComputeWaitTime(const double& output_time, + const double& input_time, + const double& buffer_size, + double* output_time_derivative, + double* input_time_derivative, + double* buffer_size_derivative) { + // If we set x=`input_time`, y=`output_time`, n=`buffer_size`, + // p=`p_buffer_empty`, T=`wait_time`, then we have: + // if y = 0, then p = 0; + // elif x = 0, then p = 1; + // elif x = y, then p = 1 / (n+1); + // else p = [1 - x/y] / [1 - power(x/y, n+1)]. + // + // We also have T = p * y, and derivatives of T w.r.t. x, y, n are computed: + // dT/dx = dp/dx * y, + // dT/dy = p + dp/dy * y, + // dT/dn = dp/dn * y. + // Then the remaining work is to compute dp/dx, dp/dy, dp/dn by considering + // different cases and substitute the values into above formulas. + + // Case 1: if producer is infinitely fast. The buffer will always be full. + // Wait time will always be 0. + if (output_time == 0) { + if (output_time_derivative) { + // Note a common error is `*output_time_derivative = 0` since p=0 on the + // line y=0 doesn't imply dp/dy = 0 there. Actually to compute dp/dy at + // (x,0), we need to consider lim_{dy->0+} [p(x,dy)-p(x,0)] / dy, where + // p(x,0)=0 and p(x,dy) = [1 - x/dy] / [1 - power(x/dy, n+1)]. + if (buffer_size == 0 || input_time == 0) { + *output_time_derivative = 1.0L; + } else { + *output_time_derivative = 0.0L; + } + } + if (input_time_derivative) { + *input_time_derivative = 0.0L; + } + if (buffer_size_derivative) { + *buffer_size_derivative = 0.0L; + } + return 0.0L; + } + + // Case 2: if consumer is infinitely fast. Wait time is always the time to + // produce an output. 
+ if (input_time == 0) { + if (output_time_derivative) { + *output_time_derivative = 1.0L; + } + if (input_time_derivative) { + // Note a common error is `*input_time_derivative = 0` since p=1 on the + // line x=0 doesn't imply dp/dx = 0 there. Actually to compute dp/dx at + // (0,y), we need to consider lim_{dx->0+} [p(dx,y)-p(0,y)] / dx, where + // p(0,y)=1, p(dx,y) = [1 - dx/y] / [1 - power(dx/y, n+1)] if y!=0. + if (buffer_size == 0) { + *input_time_derivative = 0.0L; + } else { + *input_time_derivative = -1.0L; + } + } + if (buffer_size_derivative) { + *buffer_size_derivative = 0.0L; + } + return output_time; + } + + // Case 3: the consumer and the producer are equally fast. Expected wait time + // decreases linearly with the size of the buffer. + if (input_time == output_time) { + const double p_buffer_empty = 1.0L / (buffer_size + 1.0L); + const double p_buffer_empty_der = + -buffer_size / (2.0L * buffer_size + 2.0L); + if (output_time_derivative) { + // Note a common error is `*output_time_derivative = p_buffer_empty` since + // p=1/(n+1) on the line x=y doesn't imply dp/dy = 0 there. Actually to + // compute dp/dy at (y,y), we need to consider + // lim_{dy->0} [p(y,y+dy)-p(y,y)] / dy, where p(y,y)=1/(n+1), + // p(y,y+dy) = [1 - y/(y+dy)] / [1 - power(y/(y+dy), n+1)]. + *output_time_derivative = p_buffer_empty - p_buffer_empty_der; + } + if (input_time_derivative) { + // Note a common error is `*input_time_derivative = 0` since + // p=1/(n+1) on the line x=y doesn't imply dp/dx = 0 there. Actually to + // compute dp/dx at (x,x), we need to consider + // lim_{dx->0} [p(x+dx,x)-p(x,x)] / dx, where p(x,x)=1/(n+1), + // p(x+dx,x) = [1 - (x+dx)/x] / [1 - power((x+dx)/x, n+1)]. + *input_time_derivative = p_buffer_empty_der; + } + if (buffer_size_derivative) { + *buffer_size_derivative = -output_time / Square(buffer_size + 1.0L); + } + return p_buffer_empty * output_time; + } + + // Case 4: the consumer is slower than the producer and neither is infinitely + // fast. Case 4 and Case 5 actually follow same formula. Separate them for + // numerical computation reasons. + if (input_time > output_time) { + const double ratio = output_time / input_time; + const double ratio_pow = std::pow(ratio, buffer_size); + const double p_buffer_empty = + ratio_pow * (1.0L - ratio) / (1.0L - ratio * ratio_pow); + const double p_buffer_empty_der = + (buffer_size - (buffer_size + 1.0L) * ratio + ratio_pow * ratio) * + ratio_pow / ratio / Square(1.0L - ratio_pow * ratio); + if (output_time_derivative) { + *output_time_derivative = p_buffer_empty + p_buffer_empty_der * ratio; + } + if (input_time_derivative) { + *input_time_derivative = -p_buffer_empty_der * Square(ratio); + } + if (buffer_size_derivative) { + *buffer_size_derivative = p_buffer_empty / (1.0L - ratio_pow * ratio) * + std::log(ratio) * output_time; + } + return p_buffer_empty * output_time; + } + + // Case 5: the producer is slower than the consumer and neither is infinitely + // fast. 
+ const double ratio = input_time / output_time; + const double ratio_pow = std::pow(ratio, buffer_size); + const double p_buffer_empty = (1.0L - ratio) / (1.0L - ratio_pow * ratio); + const double p_buffer_empty_der = + ((buffer_size + 1.0L - buffer_size * ratio) * ratio_pow - 1.0L) / + Square(1.0L - ratio_pow * ratio); + if (output_time_derivative) { + *output_time_derivative = p_buffer_empty - p_buffer_empty_der * ratio; + } + if (input_time_derivative) { + *input_time_derivative = p_buffer_empty_der; + } + if (buffer_size_derivative) { + *buffer_size_derivative = p_buffer_empty / (1.0L - ratio_pow * ratio) * + ratio_pow * ratio * std::log(ratio) * output_time; + } + return p_buffer_empty * output_time; +} + void Node::CollectTunableParameters( absl::flat_hash_map>* parameters) const { CollectTunableParametersHelper(parameters); diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index 97ac9dd35ae..a4af549fad2 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -285,6 +285,26 @@ class Node { autotune_.store(autotune); } + // Given the average time between output events (`output_time`), the average + // time between input events (`input_time`) and the buffer size, the method + // computes the expected time an input event will have to wait. + // + // The wait time is approximated as the product of the probability the buffer + // will be empty and the time it takes to produce an element into the buffer. + // + // The formula used for computing the probability is derived by modeling the + // problem as an M/M/1/K queue + // (https://en.wikipedia.org/wiki/Birth%E2%80%93death_process#M/M/1/K_queue). + // + // Collects derivatives of `ComputeWaitTime` w.r.t `output_time`, `input_time' + // and `buffer_size` if the corresponding pointers are not `nullptr`. + static double ComputeWaitTime(const double& output_time, + const double& input_time, + const double& buffer_size, + double* output_time_derivative, + double* input_time_derivative, + double* buffer_size_derivative); + // Collects tunable parameters in the subtree rooted in this node. 
void CollectTunableParameters( absl::flat_hash_map>* parameters) const diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc index 92c309bd476..898594b7c81 100644 --- a/tensorflow/core/framework/model_test.cc +++ b/tensorflow/core/framework/model_test.cc @@ -757,6 +757,78 @@ TEST(SnapshotTest, Model) { } } } + +class ComputeWaitTimeTest + : public ::testing::TestWithParam> {}; + +TEST_P(ComputeWaitTimeTest, Model) { + const double output_time = std::get<0>(GetParam()); + const double input_time = std::get<1>(GetParam()); + const double buffer_size = std::get<2>(GetParam()); + + double output_time_derivative = 0.0L; + double input_time_derivative = 0.0L; + double buffer_size_derivative = 0.0L; + + double wait_time = model::Node::ComputeWaitTime( + output_time, input_time, buffer_size, &output_time_derivative, + &input_time_derivative, &buffer_size_derivative); + + double new_wait_time = + model::Node::ComputeWaitTime(output_time + kParameterStep, input_time, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(output_time_derivative, + (new_wait_time - wait_time) / kParameterStep, + kComparisonPrecision); + + if (output_time >= kParameterStep) { + new_wait_time = + model::Node::ComputeWaitTime(output_time - kParameterStep, input_time, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(output_time_derivative, + (wait_time - new_wait_time) / kParameterStep, + kComparisonPrecision); + } + + new_wait_time = + model::Node::ComputeWaitTime(output_time, input_time + kParameterStep, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(input_time_derivative, + (new_wait_time - wait_time) / kParameterStep, + kComparisonPrecision); + + if (input_time >= kParameterStep) { + new_wait_time = + model::Node::ComputeWaitTime(output_time, input_time - kParameterStep, + buffer_size, nullptr, nullptr, nullptr); + EXPECT_NEAR(input_time_derivative, + (wait_time - new_wait_time) / kParameterStep, + kComparisonPrecision); + } + + new_wait_time = model::Node::ComputeWaitTime(output_time, input_time, + buffer_size + kParameterStep, + nullptr, nullptr, nullptr); + EXPECT_NEAR(buffer_size_derivative, + (new_wait_time - wait_time) / kParameterStep, + kComparisonPrecision); + + if (buffer_size >= kParameterStep) { + new_wait_time = model::Node::ComputeWaitTime(output_time, input_time, + buffer_size - kParameterStep, + nullptr, nullptr, nullptr); + EXPECT_NEAR(buffer_size_derivative, + (wait_time - new_wait_time) / kParameterStep, + kComparisonPrecision); + } +} + +INSTANTIATE_TEST_SUITE_P( + Test, ComputeWaitTimeTest, + ::testing::Combine(::testing::Values(0, 20, 40, 80, 100), + ::testing::Values(0, 20, 40, 80, 100), + ::testing::Values(0, 1, 2, 4, 10, 20, 40))); + } // namespace } // namespace model } // namespace data From 805c399ead74b45ee5587d786e5fbd20a6592768 Mon Sep 17 00:00:00 2001 From: Stella Laurenzo Date: Wed, 13 May 2020 17:28:34 -0700 Subject: [PATCH 152/412] Legalize TF broadcasting ops via dedicated xla_chlo.broadcast_* ops. * Also enable patterns to expand these ops to explicit broadcast forms. * Cleans up some test cases that it was not clear they were adding value. * Also adds a registration for the shape dialect to the tf2xla bridge (this was causing an assert in the TF AOT compiler tests). 
PiperOrigin-RevId: 311437273 Change-Id: I2c2a1367d1b5d208b9d92f4d0feb665c3a09c786 --- tensorflow/compiler/mlir/xla/BUILD | 1 + .../compiler/mlir/xla/tests/legalize-tf.mlir | 246 +++++------------- .../mlir/xla/transforms/legalize_tf.cc | 7 + .../xla/transforms/legalize_tf_patterns.td | 38 +-- tensorflow/compiler/tf2xla/BUILD | 1 + tensorflow/compiler/tf2xla/mlir_tf2xla.cc | 2 + 6 files changed, 102 insertions(+), 193 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 590595a668f..12334e463fa 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -132,6 +132,7 @@ cc_library( "transforms/legalize_tf_control_flow.cc", ], deps = [ + ":chlo_legalize_to_hlo", ":convert_op_folder", ":hlo", "//tensorflow/compiler/mlir/tensorflow", diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index a5353beb772..450910b2e4d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -426,6 +426,8 @@ func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tenso //===----------------------------------------------------------------------===// // Binary op legalizations. +// Most of these expand from the same pattern. Full semantics are +// verified for tf.Add and pattern application only for the rest. //===----------------------------------------------------------------------===// // CHECK-LABEL: func @add @@ -439,19 +441,49 @@ func @add(%arg0: tensor<2xi32>) -> tensor<2xi32> { } // CHECK-LABEL: func @broadcast_add +// TODO(laurenzo): Change this to a (5 + 2x1) shaped add to make the check +// patterns unambiguous and more interesting (once broadcastable trait is +// fixed upstream). 
func @broadcast_add(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] %0 = "tf.Add"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> return %0: tensor<1x2xi32> } // CHECK-LABEL: func @broadcast_multi_dim_add +// TODO(laurenzo): Change this to a (4x1x1 + 1x4x4x4) shaped add once upstream +// broadcastable bug is fixed (helps make the CHECK matching unambiguous) func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - // CHECK-NEXT: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK: %[[UNUSED_LHS_SHAPE:.+]] = shape.const_shape [4, 1, 1] + // CHECK: %[[UNUSED_RHS_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK: %[[RESULT_SHAPE:.+]] = shape.const_shape [4, 4, 4, 4] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1, 2, 3]> : tensor<4xi64>} + // CHECK: xla_hlo.add %[[LHS_BCAST]], %[[RHS_BCAST]] %0 = "tf.Add"(%arg0, %arg1) : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> return %0: tensor<4x4x4x4xi32> } +// CHECK-LABEL: func @add_dynamic +func @add_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: xla_hlo.add %4, %5 : tensor + %0 = "tf.Add"(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0: tensor +} + // CHECK-LABEL: func @div func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> @@ -460,13 +492,6 @@ func @div(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_div -func @broadcast_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @shift_left func 
@shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK: xla_hlo.shift_left %arg0, %arg1 : tensor<4xi32> @@ -474,13 +499,6 @@ func @shift_left(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0 : tensor<4xi32> } -// CHECK-LABEL: func @div_dynamic -func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Div"(%arg0, %arg1) : (tensor, tensor) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @div_unranked func @div_unranked(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor { // CHECK: tf.Div @@ -510,13 +528,6 @@ func @mul(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_mul -func @broadcast_mul(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Mul"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @real_div func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = xla_hlo.divide %arg0, %arg0 : tensor<2xi32> @@ -524,13 +535,6 @@ func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_real_div -func @broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RealDiv"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @sub func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK-NEXT: %0 = xla_hlo.subtract %arg0, %arg0 : tensor<2xi32> @@ -539,13 +543,6 @@ func @sub(%arg0: tensor<2xi32>) -> tensor<2xi32> { return %0: tensor<2xi32> } -// CHECK-LABEL: func @broadcast_sub -func @broadcast_sub(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - // CHECK-NEXT: "xla_hlo.subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.Sub"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> - return %0: tensor<1x2xi32> -} - // CHECK-LABEL: func @shift_right func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK: xla_hlo.shift_right_arithmetic %arg0, %arg1 : tensor<4xi32> @@ -553,13 +550,6 @@ func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0 : tensor<4xi32> } -// CHECK-LABEL: func @broadcast_shift_right -func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { - // CHECK: "xla_hlo.shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} - %0 = "tf.RightShift"(%arg0, %arg1) : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> - return %0 : tensor<2x4xi32> -} - // CHECK-LABEL: func @shift_right_unsigned func @shift_right_unsigned(%arg0: tensor<4xui8>, %arg1: tensor<4xui8>) -> tensor<4xui8> { // CHECK: tf.RightShift @@ -581,20 +571,6 @@ func @and(%arg0: tensor<2xi1>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @and_broadcast -func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.and" - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @and_dynamic -func 
@and_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: "xla_hlo.and" - %0 = "tf.LogicalAnd"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @and_unranked func @and_unranked(%arg0: tensor<*xi1>, %arg1: tensor<*xi1>) -> tensor<*xi1> { // CHECK: tf.LogicalAnd @@ -609,20 +585,6 @@ func @or(%arg0: tensor<2xi1>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @or_broadcast -func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @or_dynamic -func @or_dynamic(%arg0: tensor, %arg1: tensor<1xi1>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.LogicalOr"(%arg0, %arg1) : (tensor, tensor<1xi1>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @bitwise_or func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK-NEXT: xla_hlo.or @@ -630,20 +592,6 @@ func @bitwise_or(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0: tensor<4xi32> } -// CHECK-LABEL: func @bitwise_or_broadcast -func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_or_dynamic -func @bitwise_or_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.or - %0 = "tf.BitwiseOr"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @bitwise_and func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { // CHECK-NEXT: xla_hlo.and @@ -651,20 +599,6 @@ func @bitwise_and(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi32> { return %0: tensor<4xi32> } -// CHECK-LABEL: func @bitwise_and_broadcast -func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> - return %0: tensor<1x4xi8> -} - -// CHECK-LABEL: func @bitwise_and_dynamic -func @bitwise_and_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: xla_hlo.and - %0 = "tf.BitwiseAnd"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @pow func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK-NEXT: xla_hlo.power @@ -672,13 +606,6 @@ func @pow(%arg0: tensor<2xf32>) -> tensor<2xf32> { return %0: tensor<2xf32> } -// CHECK-LABEL: func @pow_dynamic -func @pow_dynamic(%arg0: tensor) -> tensor { - // CHECK-NEXT: xla_hlo.power - %0 = "tf.Pow"(%arg0, %arg0) : (tensor, tensor) -> tensor - return %0: tensor -} - // CHECK-LABEL: func @diag_part // CHECK-SAME: %[[ARG:.*]]: tensor<4x3x4x3xf32> func @diag_part(%arg0: tensor<4x3x4x3xf32>) -> tensor<4x3xf32> { @@ -862,6 +789,8 @@ func @broadcast_to(%arg0: tensor<16xf32>) -> tensor<16x16x16x16xf32> { //===----------------------------------------------------------------------===// // Equality op legalizations. +// tf.Equal and tf.NotEqual expand from the same pattern. 
Full semantics are +// verified for tf.Equal and pattern application only for tf.NotEqual //===----------------------------------------------------------------------===// // CHECK-LABEL: func @equal @@ -873,14 +802,26 @@ func @equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-LABEL: func @equal_dynamic func @equal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "EQ"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} %0 = "tf.Equal"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor return %0: tensor } // CHECK-LABEL: func @equal_broadcast func @equal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "EQ"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "EQ"} %0 = "tf.Equal"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } @@ -927,70 +868,42 @@ func @notequal(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @notequal_dynamic -func @notequal_dynamic(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_broadcast -func @notequal_broadcast(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} - %0 = "tf.NotEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_broadcast_no_incompatible_shapes_error -func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_broadcastable -func 
@notequal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: tensor<1xi32>) -> tensor { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor<1xi32>) -> tensor - return %0: tensor -} - -// CHECK-LABEL: func @notequal_incompatible_shape_dynamic -func @notequal_incompatible_shape_dynamic(%arg0: tensor<2xi32>, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor<2xi32>, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - -// CHECK-LABEL: func @notequal_incompatible_shape_both_dynamic -func @notequal_incompatible_shape_both_dynamic(%arg0: tensor, %arg1: tensor) -> tensor<*xi1> { - // CHECK-NEXT: "tf.NotEqual"(%arg0, %arg1) {incompatible_shape_error = false} - %0 = "tf.NotEqual"(%arg0, %arg1) { incompatible_shape_error = false } : (tensor, tensor) -> tensor<*xi1> - return %0: tensor<*xi1> -} - //===----------------------------------------------------------------------===// // Compare op legalizations. +// These expand from the same pattern. Full semantics are checked for +// tf.Greater. Others just check that the pattern applied. //===----------------------------------------------------------------------===// // CHECK-LABEL: func @greater func @greater(%arg0: tensor<2xi32>) -> tensor<2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} + // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg0) : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi1> return %0: tensor<2xi1> } // CHECK-LABEL: func @broadcast_greater func @broadcast_greater(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GT"} + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.const_shape [1] + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = shape.const_shape [1, 2] + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} %0 = "tf.Greater"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> return %0: tensor<1x2xi1> } // CHECK-LABEL: func @greater_dynamic -func @greater_dynamic(%arg0: tensor) -> tensor { - // CHECK: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "GT"} - %0 = "tf.Greater"(%arg0, %arg0) : (tensor, tensor) -> tensor +func @greater_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK-DAG: %[[LHS_SHAPE:.+]] = shape.shape_of %arg0 + // CHECK-DAG: %[[RHS_SHAPE:.+]] = shape.shape_of %arg1 + // CHECK-DAG: %[[RESULT_SHAPE:.+]] = "shape.broadcast"(%[[LHS_SHAPE]], %[[RHS_SHAPE]]) + // CHECK-DAG: %[[RESULT_EXTENTS:.+]] = "shape.to_extent_tensor"(%[[RESULT_SHAPE]]) + // CHECK-DAG: %[[LHS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK-DAG: %[[RHS_BCAST:.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[RESULT_EXTENTS]]) {broadcast_dimensions = dense<0> : tensor<1xi64>} + // CHECK: "xla_hlo.compare"(%[[LHS_BCAST]], %[[RHS_BCAST]]) {comparison_direction = "GT"} + %0 = "tf.Greater"(%arg0, %arg1) : (tensor, tensor) -> tensor return %0: tensor } @@ -1008,13 +921,6 @@ func @greater_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @broadcast_greater_equal -func @broadcast_greater_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "GE"} - %0 = "tf.GreaterEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - // CHECK-LABEL: func @less func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LT"} @@ -1022,13 +928,6 @@ func @less(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @broadcast_less -func @broadcast_less(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LT"} - %0 = "tf.Less"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - // CHECK-LABEL: func @less_equal func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg0) {comparison_direction = "LE"} @@ -1036,13 +935,6 @@ func @less_equal(%arg0: tensor<2xi32>) -> tensor<2xi1> { return %0: tensor<2xi1> } -// CHECK-LABEL: func @broadcast_less_equal -func @broadcast_less_equal(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - // CHECK-NEXT: "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "LE"} - %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> - return %0: tensor<1x2xi1> -} - //===----------------------------------------------------------------------===// // Complex op legalizations. diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index a0a5e47ad65..10bac232b0f 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -44,9 +44,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/xla/convert_op_folder.h" +#include "tensorflow/compiler/mlir/xla/ir/chlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" #include "tensorflow/compiler/mlir/xla/ir/hlo_utils.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/client/sharding_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -4955,7 +4957,12 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { ConvertRandomShuffleOp, ConvertVariableShapeOp, ConvertXlaShardingOp, ConvertXlaDynamicUpdateSliceOp>(op->getContext()); + // Populate with CHLO->HLO lowerings to account for TF ops legalized to + // CHLO first. 
+ xla_chlo::PopulateLegalizeChloToHloPatterns(context, &patterns); + ConversionTarget target(*context); + target.addIllegalDialect(); target.addLegalDialect(); target.addLegalDialect(); target.addLegalDialect(); diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index 2a27c1f2966..959902692dc 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -18,6 +18,7 @@ limitations under the License. include "mlir/IR/OpBase.td" include "mlir/Dialect/StandardOps/IR/Ops.td" include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td" +include "tensorflow/compiler/mlir/xla/ir/chlo_ops.td" include "tensorflow/compiler/mlir/xla/ir/hlo_ops.td" def SignedIntTensor : TensorOf<[I1, I8, I16, I32, I64]>; @@ -80,6 +81,9 @@ def BiasAddFeatureDimension : NativeCodeCall< // $input needs to be a ranked tensor to identify index of the feature // dimension depending on the data_format 'NHWC' or 'NCHW'. +// TODO(laurenzo): This should be converted to do explicit broadcasting since +// it can generate broadcast dimensions that are not compatible with the simple +// xla_chlo.add broadcast_dims. def : Pat<(TF_BiasAddOp AnyRankedTensor:$input, $bias, $data_format), (HLO_AddOp $input, $bias, (BiasAddFeatureDimension $data_format, $input))>; @@ -96,16 +100,16 @@ class DirectBinaryPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), (ToOp $l, $r, (BinBroadcastDimensions $l, $r))>; -foreach fromToBinPair = [[TF_AddOp, HLO_AddOp], - [TF_AddV2Op, HLO_AddOp], - [TF_DivOp, HLO_DivOp], - [TF_LeftShiftOp, HLO_ShiftLeftOp], - [TF_MaximumOp, HLO_MaxOp], - [TF_MinimumOp, HLO_MinOp], - [TF_MulOp, HLO_MulOp], - [TF_PowOp, HLO_PowOp], - [TF_RealDivOp, HLO_DivOp], - [TF_SubOp, HLO_SubOp]] in +foreach fromToBinPair = [[TF_AddOp, HLOClient_BroadcastAddOp], + [TF_AddV2Op, HLOClient_BroadcastAddOp], + [TF_DivOp, HLOClient_BroadcastDivOp], + [TF_LeftShiftOp, HLOClient_BroadcastShiftLeftOp], + [TF_MaximumOp, HLOClient_BroadcastMaxOp], + [TF_MinimumOp, HLOClient_BroadcastMinOp], + [TF_MulOp, HLOClient_BroadcastMulOp], + [TF_PowOp, HLOClient_BroadcastPowOp], + [TF_RealDivOp, HLOClient_BroadcastDivOp], + [TF_SubOp, HLOClient_BroadcastSubOp]] in def : DirectBinaryPat; def LowerRightShiftSigned : @@ -196,10 +200,10 @@ class DirectLogicalBinaryPat (ToOp $l, $r, (BinBroadcastDimensions $l, $r)), [(SignedIntTensor $l)]>; -foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], - [TF_LogicalOrOp, HLO_OrOp], - [TF_BitwiseOrOp, HLO_OrOp], - [TF_BitwiseAndOp, HLO_AndOp]] in +foreach fromToBinPair = [[TF_LogicalAndOp, HLOClient_BroadcastAndOp], + [TF_LogicalOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseOrOp, HLOClient_BroadcastOrOp], + [TF_BitwiseAndOp, HLOClient_BroadcastAndOp]] in def : DirectLogicalBinaryPat; //===----------------------------------------------------------------------===// @@ -208,7 +212,8 @@ foreach fromToBinPair = [[TF_LogicalAndOp, HLO_AndOp], class DirectComparePat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction)>; + (HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction)>; def : DirectComparePat; def : DirectComparePat; @@ -218,7 +223,8 @@ def : DirectComparePat; class EqualityPat : Pat<(FromOp AnyRankedTensor:$l, AnyRankedTensor:$r, TrueBoolAttr:$incompatible_shape_error), - (HLO_CompareOp $l, $r, (BinBroadcastDimensions $l, $r), direction), + 
(HLOClient_BroadcastCompareOp + $l, $r, (BinBroadcastDimensions $l, $r), direction), [(AreBroadcastCompatible $l, $r)]>; def : EqualityPat; diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index c2ad1255a35..897528b6de9 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -182,6 +182,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "@com_google_absl//absl/strings", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Shape", "@llvm-project//mlir:StandardOps", ], ) diff --git a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc index daf261fa5d8..43793be56a7 100644 --- a/tensorflow/compiler/tf2xla/mlir_tf2xla.cc +++ b/tensorflow/compiler/tf2xla/mlir_tf2xla.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -95,6 +96,7 @@ static void RegisterDialects() { mlir::registerDialect(); mlir::registerDialect(); mlir::registerDialect(); + mlir::registerDialect(); return true; }(); (void)init_once; From e989b132d4ed9625dee8a3896844f81bc54d1101 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Wed, 13 May 2020 17:42:23 -0700 Subject: [PATCH 153/412] Simplify error logging in XNNPACK delegate Use TF_LITE_MAYBE_KERNEL_LOG to remove redundant if blocks PiperOrigin-RevId: 311439195 Change-Id: I3f75e6178061b63d01a7b935e6d23739651f37d0 --- .../delegates/xnnpack/xnnpack_delegate.cc | 327 +++++++----------- 1 file changed, 128 insertions(+), 199 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc index 388509c9873..6d9b4dac8f8 100644 --- a/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc +++ b/tensorflow/lite/delegates/xnnpack/xnnpack_delegate.cc @@ -245,10 +245,9 @@ class Subgraph { *flags = 0; return kTfLiteOk; default: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid padding mode (%d) in node #%d", - static_cast(padding), node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid padding mode (%d) in node #%d", + static_cast(padding), node_index); return kTfLiteError; } } @@ -274,32 +273,24 @@ class Subgraph { *output_max = 6.0f; return kTfLiteOk; case kTfLiteActTanh: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unsupported fused activation (Tanh) in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported fused activation (Tanh) in node #%d", + node_index); return kTfLiteError; case kTfLiteActSignBit: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unsupported fused activation (Sign) in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported fused activation (Sign) in node #%d", + node_index); return kTfLiteError; case kTfLiteActSigmoid: - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, "unsupported fused activation (Sigmoid) in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported fused activation (Sigmoid) in node #%d", + node_index); return kTfLiteError; default: - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid fused activation (%d) in node #%d", - static_cast(activation), node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid fused activation (%d) in node #%d", + 
static_cast(activation), node_index); return kTfLiteError; } } @@ -308,34 +299,26 @@ class Subgraph { const TfLiteConvParams* params, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } if (params->dilation_width_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation width factor %d in node #%d", - params->dilation_width_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation width factor %d in node #%d", + params->dilation_width_factor, node_index); return kTfLiteError; } if (params->dilation_height_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation height factor %d in node #%d", - params->dilation_height_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation height factor %d in node #%d", + params->dilation_height_factor, node_index); return kTfLiteError; } @@ -346,52 +329,41 @@ class Subgraph { TfLiteContext* context, const TfLiteDepthwiseConvParams* params, int output_channels, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } if (params->depth_multiplier <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid depth multiplier %d in node #%d", - params->depth_multiplier, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid depth multiplier %d in node #%d", + params->depth_multiplier, node_index); return kTfLiteError; } if (output_channels % params->depth_multiplier != 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "depth multiplier %d is incompatible with " - "number of output channels %d in node #%d", - params->depth_multiplier, output_channels, - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "depth multiplier %d is incompatible with " + "number of output channels %d in node #%d", + params->depth_multiplier, output_channels, + node_index); return kTfLiteError; } if (params->dilation_width_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation width factor %d in node #%d", - params->dilation_width_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation width factor %d in node #%d", + params->dilation_width_factor, node_index); return kTfLiteError; } if (params->dilation_height_factor <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dilation height factor %d in node 
#%d", - params->dilation_height_factor, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dilation height factor %d in node #%d", + params->dilation_height_factor, node_index); return kTfLiteError; } @@ -402,17 +374,13 @@ class Subgraph { TfLiteContext* context, const TfLiteTransposeConvParams* params, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } @@ -502,11 +470,9 @@ class Subgraph { TfLiteContext* context, const TfLiteFullyConnectedParams* params, int node_index) { if (params->weights_format != kTfLiteFullyConnectedWeightsFormatDefault) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unsupported non-default weights format in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported non-default weights format in node #%d", + node_index); return kTfLiteError; } @@ -517,39 +483,29 @@ class Subgraph { const TfLitePoolParams* params, int node_index) { if (params->stride_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride width %d in node #%d", - params->stride_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride width %d in node #%d", + params->stride_width, node_index); return kTfLiteError; } if (params->stride_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid stride height %d in node #%d", - params->stride_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid stride height %d in node #%d", + params->stride_height, node_index); return kTfLiteError; } if (params->filter_width <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid filter width %d in node #%d", - params->filter_width, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid filter width %d in node #%d", + params->filter_width, node_index); return kTfLiteError; } if (params->filter_height <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "invalid filter height %d in node #%d", - params->filter_height, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "invalid filter height %d in node #%d", + params->filter_height, node_index); return kTfLiteError; } if (params->filter_width == 1 && params->filter_height == 1) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, "meaningless 1x1 pooling in node #%d", - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, "meaningless 1x1 pooling in node #%d", + node_index); return kTfLiteError; } @@ -562,19 +518,15 @@ class Subgraph { int expected_num_outputs, int node_index) { if (node->inputs->size != expected_num_inputs) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unexpected number of inputs (%d != %d) in node #%d", - node->inputs->size, expected_num_inputs, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unexpected number of inputs (%d != %d) in node #%d", + node->inputs->size, expected_num_inputs, node_index); return kTfLiteError; } if (node->outputs->size != 
expected_num_outputs) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, "unexpected number of output (%d != %d) in node #%d", - node->outputs->size, expected_num_outputs, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unexpected number of output (%d != %d) in node #%d", + node->outputs->size, expected_num_outputs, node_index); return kTfLiteError; } return kTfLiteOk; @@ -584,11 +536,9 @@ class Subgraph { const TfLiteTensor& tensor, int tensor_index, int node_index) { if (tensor.type != kTfLiteFloat32) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, "unsupported type %s in tensor #%d in node #%d", - TfLiteTypeGetName(tensor.type), tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, "unsupported type %s in tensor #%d in node #%d", + TfLiteTypeGetName(tensor.type), tensor_index, node_index); return kTfLiteError; } return kTfLiteOk; @@ -599,21 +549,17 @@ class Subgraph { int expected_num_dims, int tensor_index) { if (tensor.dims->size != expected_num_dims) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG( - context, - "unexpected number of shape dimensions (%d != %d) in tensor #%d", - tensor.dims->size, expected_num_dims, tensor_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "unexpected number of shape dimensions (%d != %d) in tensor #%d", + tensor.dims->size, expected_num_dims, tensor_index); return kTfLiteError; } for (int i = 0; i < tensor.dims->size; i++) { if (tensor.dims->data[i] <= 0) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid dimension #%d (%d) in tensor #%d", i, - tensor.dims->data[i], tensor_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "invalid dimension #%d (%d) in tensor #%d", i, + tensor.dims->data[i], tensor_index); return kTfLiteError; } } @@ -624,25 +570,22 @@ class Subgraph { const TfLiteTensor& tensor, int tensor_index, int node_index) { if (tensor.dims->size < 1) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unexpected number of shape dimensions (%d) in " - "tensor #%d in node #%d: " - "expected at least a 1D tensor", - tensor.dims->size, tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG(context, + "unexpected number of shape dimensions (%d) in " + "tensor #%d in node #%d: " + "expected at least a 1D tensor", + tensor.dims->size, tensor_index, node_index); return kTfLiteError; } // Validate that all non-channel dimensions (if any) are exactly 1. 
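    // In other words, the tensor must be equivalent to a 1-D [channels]
    // vector: a higher-rank shape is accepted only when every leading
    // dimension is exactly 1.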
for (int i = 0; i < tensor.dims->size - 1; i++) { if (tensor.dims->data[i] != 1) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "unexpected value %d of shape dimension #%d in " - "tensor #%d in node #%d: " - "expected 1 for non-channel dimensions", - tensor.dims[i], i, tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "unexpected value %d of shape dimension #%d in " + "tensor #%d in node #%d: " + "expected 1 for non-channel dimensions", + tensor.dims[i], i, tensor_index, node_index); return kTfLiteError; } } @@ -654,12 +597,11 @@ class Subgraph { int node_index) { // TODO(b/149120844): remove checks once dynamic tensors are supported if (tensor.allocation_type == kTfLiteDynamic) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid allocation type in tensor #%d in node #%d: " - "expected non-dynamic tensor", - tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "invalid allocation type in tensor #%d in node #%d: " + "expected non-dynamic tensor", + tensor_index, node_index); return kTfLiteError; } return kTfLiteOk; @@ -671,12 +613,11 @@ class Subgraph { int node_index) { if (tensor.allocation_type != kTfLiteMmapRo || tensor.data.raw_const == nullptr) { - if (context != nullptr) { - TF_LITE_KERNEL_LOG(context, - "invalid allocation type in tensor #%d in node #%d: " - "expected static read-only tensor", - tensor_index, node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + context, + "invalid allocation type in tensor #%d in node #%d: " + "expected static read-only tensor", + tensor_index, node_index); return kTfLiteError; } return kTfLiteOk; @@ -1134,23 +1075,19 @@ class Subgraph { const int32_t input_channels = filter_tensor.dims->data[1]; if (input_tensor.dims->size == 0) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "unexpected number of shape dimensions %d in tensor #%d", - input_tensor.dims->size, node->inputs->data[0]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "unexpected number of shape dimensions %d in tensor #%d", + input_tensor.dims->size, node->inputs->data[0]); return kTfLiteError; } int32_t num_input_elements = 1; for (int i = 0; i < input_tensor.dims->size; i++) { if (input_tensor.dims->data[i] <= 0) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG(logging_context, - "invalid dimension #%d (%d) in tensor #%d", i, - input_tensor.dims->data[i], node->inputs->data[0]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, "invalid dimension #%d (%d) in tensor #%d", i, + input_tensor.dims->data[i], node->inputs->data[0]); return kTfLiteError; } num_input_elements *= input_tensor.dims->data[i]; @@ -1163,55 +1100,47 @@ class Subgraph { for (int i = 0; i < input_tensor.dims->size - 1; i++) { if (input_tensor.dims->data[i] != output_tensor.dims->data[i]) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "mismatch in shape dimension %d (%d != %d) in input and output " - "tensors of FULLY_CONNECTED operator #%d", - i, input_tensor.dims->data[i], output_tensor.dims->data[i], - node_index); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "mismatch in shape dimension %d (%d != %d) in input and output " + "tensors of FULLY_CONNECTED operator #%d", + i, input_tensor.dims->data[i], output_tensor.dims->data[i], + node_index); return kTfLiteError; } } } else { if (num_input_elements % input_channels != 0) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "number of elements in input tensor #%d in 
FULLY_CONNECTED " - "operator is not divisible by input channels (%d)", - node->inputs->data[0], input_channels); - return kTfLiteError; - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "number of elements in input tensor #%d in FULLY_CONNECTED " + "operator is not divisible by input channels (%d)", + node->inputs->data[0], input_channels); + return kTfLiteError; } TF_LITE_ENSURE_STATUS(CheckTensorShape(logging_context, output_tensor, 2, node->outputs->data[0])); if (output_tensor.dims->data[0] != num_input_elements / input_channels) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "batch size %d in output tensor #%d in FULLY_CONNECTED operator " - "does not match batch size %d in reshaped input tensor #%d", - output_tensor.dims->data[0], node->outputs->data[0], - num_input_elements / input_channels, node->inputs->data[0]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "batch size %d in output tensor #%d in FULLY_CONNECTED operator " + "does not match batch size %d in reshaped input tensor #%d", + output_tensor.dims->data[0], node->outputs->data[0], + num_input_elements / input_channels, node->inputs->data[0]); return kTfLiteError; } } if (output_tensor.dims->data[output_tensor.dims->size - 1] != output_channels) { - if (logging_context != nullptr) { - TF_LITE_KERNEL_LOG( - logging_context, - "number of channels %d in output tensor #%d does not match output " - "channels %d in filter tensor #%d", - output_tensor.dims->data[output_tensor.dims->size - 1], - node->outputs->data[0], output_channels, node->inputs->data[1]); - } + TF_LITE_MAYBE_KERNEL_LOG( + logging_context, + "number of channels %d in output tensor #%d does not match output " + "channels %d in filter tensor #%d", + output_tensor.dims->data[output_tensor.dims->size - 1], + node->outputs->data[0], output_channels, node->inputs->data[1]); return kTfLiteError; } From a76e002aa82c40d272c66ef56d43a77c5cc106b8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 17:42:35 -0700 Subject: [PATCH 154/412] Internal change PiperOrigin-RevId: 311439232 Change-Id: Ieed3345eef5dc74a2b0cc4805ed5269bf775a405 --- tensorflow/python/ops/math_ops.py | 4 ---- tensorflow/python/ops/math_ops_test.py | 6 ------ 2 files changed, 10 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 749aa89593a..4c4982c6fd5 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -82,7 +82,6 @@ from tensorflow.python.framework import graph_util from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_data_flow_ops @@ -439,9 +438,6 @@ def divide(x, y, name=None): # override names. Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: - # We do conversion here to make sure at least x is a tensor. 
- if not tensor_util.is_tensor(x): - x = ops.convert_to_tensor(x) return x / y diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 6171ea037d9..2405eec9e49 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -495,12 +495,6 @@ class DivAndModTest(test_util.TensorFlowTestCase): # Consistent with desire to get numerator self.assertAllEqual(tf_result, expanded_nums) - def testWithPythonValue(self): - # Test case for GitHub issue 39475: - # https://github.com/tensorflow/tensorflow/issues/39475 - x = math_ops.divide(5, 2) - self.assertTrue(isinstance(x, ops.Tensor)) - @test_util.run_all_in_graph_and_eager_modes class DivNoNanTest(test_util.TensorFlowTestCase): From 43691f9b891045f41b59ec1afbe06637c19d2377 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Wed, 13 May 2020 17:52:25 -0700 Subject: [PATCH 155/412] [XLA] Add use_names column to buffer info debug string to help with debugging. PiperOrigin-RevId: 311440576 Change-Id: I060aed0171625c79bfa7d8ae821f098670d6c84f --- .../xla/service/memory_space_assignment.cc | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 8752e870bb7..742de71e74c 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -585,23 +585,35 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( // definition_time: int. Logical time this value was defined in the schedule. // use_times: string. This is a semicolon-separated list of integers for all // the use times. + // use_names: string. This is a semicolon-separated list of string + // representation of uses. if (debug_str->empty()) { // Append the column names. absl::StrAppend(debug_str, - "buffer_id,buffer_name,alt_mem_benefit,size,definition_" - "time,use_times\n"); + "buffer_id,buffer_name,alt_mem_benefit,size," + "definition_time,use_times,use_names\n"); } const HloBuffer& buffer = alias_analysis_.GetBufferContainingValue(*interval.buffer); const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 definition_time = instruction_schedule.at(interval.buffer->defining_position().instruction); - std::set use_times; + std::vector> uses; for (const HloValue* value : buffer.values()) { for (const HloUse& use : value->uses()) { - use_times.insert(instruction_schedule.at(use.instruction)); + uses.push_back( + {instruction_schedule.at(use.instruction), use.ToString()}); } } + absl::c_sort(uses); + std::vector use_times; + std::vector use_names; + use_times.reserve(uses.size()); + use_names.reserve(uses.size()); + for (auto use : uses) { + use_times.push_back(use.first); + use_names.push_back(use.second); + } absl::StrAppend(debug_str, buffer.id(), ","); absl::StrAppend(debug_str, "\"", interval.buffer->ToShortString(), "\","); @@ -612,7 +624,8 @@ void AlternateMemoryBestFitHeap::AppendBufferInfoDebugString( debug_str, alternate_memory_benefit ? 
*alternate_memory_benefit : 0, ","); absl::StrAppend(debug_str, interval.size, ","); absl::StrAppend(debug_str, definition_time, ","); - absl::StrAppend(debug_str, "\"", absl::StrJoin(use_times, ";"), "\""); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_times, ";"), "\","); + absl::StrAppend(debug_str, "\"", absl::StrJoin(use_names, ";"), "\""); absl::StrAppend(debug_str, "\n"); } From 649c80888967bc3f0d9e60f51ff69e5c173537ec Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 13 May 2020 18:35:48 -0700 Subject: [PATCH 156/412] Reuse existing util functions in ReplicateToIslandPass and add back a test for remapping results (NFC). PiperOrigin-RevId: 311446065 Change-Id: Iba9516da76f6df9459b5ad323d9ac9fd7563ded7 --- .../tensorflow/tests/replicate_to_island.mlir | 22 ++++++++++++++++ .../transforms/replicate_to_island.cc | 26 +++++++------------ 2 files changed, 31 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir index c8b4ad2cb9f..8da252fc832 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/replicate_to_island.mlir @@ -119,3 +119,25 @@ func @replicate_control() { // CHECK: %[[REPLICA_1:.*]] = tf_executor.island // CHECK: %[[SINK:.*]] = tf_executor.island(%[[REPLICA_0]], %[[REPLICA_1]]) // CHECK: tf_executor.fetch %[[SINK]] + + +// Tests replicate results are remapped correctly. +// CHECK-LABEL: func @replicate_result +func @replicate_result(%arg0: tensor, %arg1: tensor) { + %0:4 = tf_executor.graph { + %1:5 = tf_executor.island { + %2:4 = tf_device.replicate([%arg0, %arg1] as %arg2: tensor) {n = 2 : i32} { + %3 = "tf.opA"(%arg2) : (tensor) -> tensor + %4 = "tf.opB"(%arg2) : (tensor) -> tensor + tf_device.return %3, %4 : tensor, tensor + } + tf_executor.yield %2#0, %2#1, %2#2, %2#3 : tensor, tensor, tensor, tensor + } + tf_executor.fetch %1#0, %1#1, %1#2, %1#3 : tensor, tensor, tensor, tensor + } + return +} + +// CHECK: %[[REPLICA_0:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: %[[REPLICA_1:.*]]:2, %{{.*}} = tf_executor.island +// CHECK: tf_executor.fetch %[[REPLICA_0]]#0, %[[REPLICA_1]]#0, %[[REPLICA_0]]#1, %[[REPLICA_1]]#1 diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc index fe9283d6932..2fd230005d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/replicate_to_island.cc @@ -156,9 +156,9 @@ llvm::SmallVector ExpandReplicateIntoReplicas( // }) {device = "/DEVICE:3"} : () -> tensor // tf_executor.yield %a1, %b1 : tensor, tensor // } -LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, - tf_executor::IslandOp island_op, - tf_device::ReplicateOp replicate_op) { +void CreateIslandsFromReplicate(const Dialect* tf_dialect, + tf_executor::IslandOp island_op, + tf_device::ReplicateOp replicate_op) { OpBuilder builder(island_op); const int num_replicas = replicate_op.n().getLimitedValue(); @@ -199,21 +199,17 @@ LogicalResult CreateIslandsFromReplicate(const Dialect* tf_dialect, } island_op.erase(); - return success(); } // Finds islands with a single `tf_device.replicate` and create individual // islands per replica of the replicate. 
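// Note that this helper no longer has a failure path: the only early exit is
// the WrapsSingleOp() check, so it returns void and runOnFunction() no longer
// needs to interrupt its walk on failure.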
-LogicalResult LowerSingleIslandReplicateToIslands( - const Dialect* tf_dialect, tf_executor::IslandOp island_op) { - if (!hasSingleElement(island_op.GetBody().without_terminator())) - return success(); +void LowerSingleIslandReplicateToIslands(const Dialect* tf_dialect, + tf_executor::IslandOp island_op) { + if (!island_op.WrapsSingleOp()) return; if (auto replicate_op = llvm::dyn_cast(&island_op.GetBody().front())) - return CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); - - return success(); + CreateIslandsFromReplicate(tf_dialect, island_op, replicate_op); } void ReplicateToIslandPass::runOnFunction() { @@ -223,13 +219,9 @@ void ReplicateToIslandPass::runOnFunction() { getFunction().emitError() << "'tf' dialect is not registered"; } - auto result = getFunction().walk([&](tf_executor::IslandOp island_op) { - if (failed(LowerSingleIslandReplicateToIslands(tf_dialect, island_op))) - return WalkResult::interrupt(); - return WalkResult::advance(); + getFunction().walk([&](tf_executor::IslandOp island_op) { + LowerSingleIslandReplicateToIslands(tf_dialect, island_op); }); - - if (result.wasInterrupted()) return signalPassFailure(); } } // anonymous namespace From 22eb1624cb3b2eb4e2369bcba35bd9156aa080d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 18:39:40 -0700 Subject: [PATCH 157/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311446541 Change-Id: I40afd0237cbf7947fe620390ff2788dbbb3203e9 --- tensorflow/go/op/wrappers.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index bab430e1472..598e3a48bfe 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -17329,13 +17329,13 @@ func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) { // SparseBincountAttr is an optional argument to SparseBincount. type SparseBincountAttr func(optionalAttr) -// SparseBincountBinaryCount sets the optional binary_count attribute to value. +// SparseBincountBinaryOutput sets the optional binary_output attribute to value. // // value: bool; Whether the kernel should count the appearance or number of occurrences. // If not specified, defaults to false -func SparseBincountBinaryCount(value bool) SparseBincountAttr { +func SparseBincountBinaryOutput(value bool) SparseBincountAttr { return func(m optionalAttr) { - m["binary_count"] = value + m["binary_output"] = value } } @@ -17434,13 +17434,13 @@ func Selu(scope *Scope, features tf.Output) (activations tf.Output) { // DenseBincountAttr is an optional argument to DenseBincount. type DenseBincountAttr func(optionalAttr) -// DenseBincountBinaryCount sets the optional binary_count attribute to value. +// DenseBincountBinaryOutput sets the optional binary_output attribute to value. // // value: bool; Whether the kernel should count the appearance or number of occurrences. // If not specified, defaults to false -func DenseBincountBinaryCount(value bool) DenseBincountAttr { +func DenseBincountBinaryOutput(value bool) DenseBincountAttr { return func(m optionalAttr) { - m["binary_count"] = value + m["binary_output"] = value } } @@ -38654,13 +38654,13 @@ func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, // RaggedBincountAttr is an optional argument to RaggedBincount. type RaggedBincountAttr func(optionalAttr) -// RaggedBincountBinaryCount sets the optional binary_count attribute to value. 
+// RaggedBincountBinaryOutput sets the optional binary_output attribute to value. // // value: bool; Whether the kernel should count the appearance or number of occurrences. // If not specified, defaults to false -func RaggedBincountBinaryCount(value bool) RaggedBincountAttr { +func RaggedBincountBinaryOutput(value bool) RaggedBincountAttr { return func(m optionalAttr) { - m["binary_count"] = value + m["binary_output"] = value } } From 43adb4ee3b96e250e05533bc2470813c2f70272c Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 13 May 2020 18:46:10 -0700 Subject: [PATCH 158/412] Remove nanopb dependency as it should not be needed anymore (grpc version on master no longer needs it) PiperOrigin-RevId: 311447262 Change-Id: Icb8b98542188be57e5402cdbe4d9fb90c2c84f8d --- tensorflow/workspace.bzl | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 6a958e1b00f..9b745656125 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -643,17 +643,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) - tf_http_archive( - name = "com_github_nanopb_nanopb", - sha256 = "18234d9f01b57248472a9bfa65c3379352b5d66c15b0ef1c2b4feece4b5670fe", - build_file = "@com_github_grpc_grpc//third_party:nanopb.BUILD", - strip_prefix = "nanopb-0.4.1", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/nanopb/nanopb/archive/0.4.1.tar.gz", - "https://github.com/nanopb/nanopb/archive/0.4.1.tar.gz", - ], - ) - tf_http_archive( name = "linenoise", build_file = clean_dep("//third_party:linenoise.BUILD"), From d2c578c71901275323ba3c00c57ec2e91531a698 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 13 May 2020 18:50:29 -0700 Subject: [PATCH 159/412] [XLA:SPMD] Avoid designated initializer. It broke external build. 
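
Designated initializers of this form are standard only from C++20 onward
(before that they are a compiler extension), so toolchains used by the
external open-source build can reject them. The replacement simply
default-constructs the struct and assigns each member. A minimal
illustration of the two forms, not taken from the TF sources:

  struct Options { int n; bool verbose; };

  // C++20 designated initializer; may not compile on pre-C++20 toolchains.
  Options a{.n = 4, .verbose = true};

  // Portable alternative: default-construct, then assign members.
  Options b;
  b.n = 4;
  b.verbose = true;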
PiperOrigin-RevId: 311447720 Change-Id: I460624dc2242deead277eb70fbd1c6a0701250f6 --- .../xla/service/spmd/spmd_partitioner.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h index 09d2c4af908..f22f564be73 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.h @@ -370,14 +370,15 @@ class SpmdPartitioningVisitor : public DfsHloVisitorWithDefault { int64 NewChannel() { return (*next_channel_id_)++; } PartitionedHlo::PartitioningState MakePartitioningState() { - return PartitionedHlo::PartitioningState{ - .b = &b_, - .module = module_, - .num_replicas = num_replicas_, - .partition_id = partition_id_, - .collective_ops_creator = collective_ops_creator_, - .next_channel_id = next_channel_id_, - .reshard_cache = &reshard_cache_}; + PartitionedHlo::PartitioningState state; + state.b = &b_; + state.module = module_; + state.num_replicas = num_replicas_; + state.partition_id = partition_id_; + state.collective_ops_creator = collective_ops_creator_; + state.next_channel_id = next_channel_id_; + state.reshard_cache = &reshard_cache_; + return state; } SpmdBuilder* builder() { return &b_; } From cde93f014c5b42800d3a43ffea53001ccc635e29 Mon Sep 17 00:00:00 2001 From: ShengYang1 Date: Thu, 14 May 2020 10:53:58 +0800 Subject: [PATCH 160/412] update --- .../core/common_runtime/mkl_layout_pass.cc | 2 +- .../common_runtime/mkl_layout_pass_test.cc | 44 +-- .../grappler/optimizers/mkl_remapper_test.cc | 277 ++++++++---------- .../core/grappler/optimizers/remapper.cc | 11 +- tensorflow/core/kernels/BUILD | 4 - 5 files changed, 145 insertions(+), 193 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 3695c4ca7f9..3374113465f 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -1680,7 +1680,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { } static bool FusedBatchNormExRewrite(const Node* n) { - CHECK_NOTNULL(n); + DCHECK(n); int num_side_inputs; TF_CHECK_OK(GetNodeAttr(n->def(), "num_side_inputs", &num_side_inputs)); diff --git a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc index 3f02c4b1512..71ab786f8a5 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass_test.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass_test.cc @@ -3216,18 +3216,17 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormV3_Negative) { "B->F:1;C->F:2;D->F:3;E->F:4;F->G:1"); } +// clang-format off #ifdef ENABLE_MKLDNN_V1 #define REGISTER_TEST(NAME, T, INPUT) \ TEST_F(MklLayoutPassTest, NAME##_##T) { \ - InitGraph("node { name: 'A' op: '" #INPUT \ - "'}" \ + InitGraph("node { name: 'A' op: '" #INPUT "'}" \ "node { name: 'B' op: 'Input'}" \ "node { name: 'C' op: 'Input'}" \ "node { name: 'D' op: 'Input'}" \ "node { name: 'E' op: 'Input'}" \ "node { name: 'F' op: '_FusedBatchNormEx'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " attr { key: 'U' value { type: DT_FLOAT } }" \ " attr { key: 'data_format' value { s: 'NCHW' } }" \ " attr { key: 'epsilon' value { f: 0.0001 } }" \ @@ -3236,12 +3235,10 @@ TEST_F(MklLayoutPassTest, NodeRewrite_FusedBatchNormV3_Negative) { " attr { key: 'activation_mode' value { s: 'Relu' } }" \ " 
input: ['A', 'B', 'C', 'D', 'E'] }" \ "node { name: 'G' op: 'Zeta'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " input: ['A', 'F'] }"); \ EXPECT_EQ(DoMklLayoutOptimizationPass(), \ - "A(" #INPUT \ - ");B(Input);C(Input);D(Input);" \ + "A(" #INPUT ");B(Input);C(Input);D(Input);" \ "DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);DMT/_3(Const);" \ "DMT/_4(Const);E(Input);" \ "F(_MklFusedBatchNormEx);G(Zeta)|A->F;A->G;" \ @@ -3257,17 +3254,14 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Positive); // Rewrite test for _FusedBatchNormEx Op with side input #define REGISTER_TEST(NAME, T, INPUT) \ TEST_F(MklLayoutPassTest, NAME##_##T) { \ - InitGraph("node { name: 'A' op: '" #INPUT \ - "'}" \ + InitGraph("node { name: 'A' op: '" #INPUT "'}" \ "node { name: 'B' op: 'Input'}" \ "node { name: 'C' op: 'Input'}" \ "node { name: 'D' op: 'Input'}" \ "node { name: 'E' op: 'Input'}" \ - "node { name: 'F' op: '" #INPUT \ - "'}" \ + "node { name: 'F' op: '" #INPUT "'}" \ "node { name: 'G' op: '_FusedBatchNormEx'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " attr { key: 'U' value { type: DT_FLOAT } }" \ " attr { key: 'data_format' value { s: 'NCHW' } }" \ " attr { key: 'epsilon' value { f: 0.0001 } }" \ @@ -3276,14 +3270,11 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Positive); " attr { key: 'activation_mode' value { s: 'Relu' } }" \ " input: ['A', 'B', 'C', 'D', 'E', 'F'] }" \ "node { name: 'H' op: 'Zeta'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " input: ['A', 'G'] }"); \ EXPECT_EQ(DoMklLayoutOptimizationPass(), \ - "A(" #INPUT \ - ");B(Input);C(Input);D(Input);E(Input);" \ - "F(" #INPUT \ - ");G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ + "A(" #INPUT ");B(Input);C(Input);D(Input);E(Input);" \ + "F(" #INPUT ");G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ "B->G:1;C->G:2;D->G:3;E->G:4;F->G:5;G->H:1"); \ } REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); @@ -3292,15 +3283,13 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); // Rewrite test for _FusedBatchNormEx Op with Identity activation #define REGISTER_TEST(NAME, T, INPUT) \ TEST_F(MklLayoutPassTest, NAME##_##T) { \ - InitGraph("node { name: 'A' op: '" #INPUT \ - "'}" \ + InitGraph("node { name: 'A' op: '" #INPUT "'}" \ "node { name: 'B' op: 'Input'}" \ "node { name: 'C' op: 'Input'}" \ "node { name: 'D' op: 'Input'}" \ "node { name: 'E' op: 'Input'}" \ "node { name: 'G' op: '_FusedBatchNormEx'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " attr { key: 'U' value { type: DT_FLOAT } }" \ " attr { key: 'data_format' value { s: 'NCHW' } }" \ " attr { key: 'epsilon' value { f: 0.0001 } }" \ @@ -3309,18 +3298,17 @@ REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative1); " attr { key: 'activation_mode' value { s: 'Identity' } }" \ " input: ['A', 'B', 'C', 'D', 'E'] }" \ "node { name: 'H' op: 'Zeta'" \ - " attr { key: 'T' value { type: " #T \ - " } }" \ + " attr { key: 'T' value { type: " #T " } }" \ " input: ['A', 'G'] }"); \ EXPECT_EQ(DoMklLayoutOptimizationPass(), \ - "A(" #INPUT \ - ");B(Input);C(Input);D(Input);E(Input);" \ + "A(" #INPUT ");B(Input);C(Input);D(Input);E(Input);" \ "G(_FusedBatchNormEx);H(Zeta)|A->G;A->H;" \ "B->G:1;C->G:2;D->G:3;E->G:4;G->H:1"); \ } REGISTER_TEST_ALL_TYPES(NodeRewrite_FusedBatchNormEx_Negative2); #undef REGISTER_TEST #endif // 
ENABLE_MKLDNN_V1 +// clang-format on TEST_F(MklLayoutPassTest, NodeRewrite_QuantizedDepthwiseConv2D_Positive) { InitGraph( diff --git a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc index 85d802a2e38..cf1953fcdb2 100644 --- a/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/mkl_remapper_test.cc @@ -300,169 +300,136 @@ TEST_F(MklRemapperTest, FuseBatchNormWithRelu) { using ::tensorflow::ops::Placeholder; for (bool is_training : {true, false}) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); + for (bool has_side_input : {true, false}) { + tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - const int num_channels = 24; + const int num_channels = 24; - TensorShape channel_shape({num_channels}); - TensorShape empty_shape({0}); + TensorShape channel_shape({num_channels}); + TensorShape empty_shape({0}); - auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, - ops::Placeholder::Shape({2, 8, 8, num_channels})); - auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); - auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); - auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); - auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); - auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); + auto input = + Placeholder(s.WithOpName("input"), DT_FLOAT, + ops::Placeholder::Shape({2, 8, 8, num_channels})); + auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); + auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); + auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); + auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); + auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); - float epsilon = 0.1f; - auto fbn = ops::FusedBatchNormV3( - s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, - ops::FusedBatchNormV3::IsTraining(is_training) - .Epsilon(epsilon) - .DataFormat("NHWC")); - auto relu = ops::Relu(s.WithOpName("relu"), fbn.y); - auto fetch = ops::Identity(s.WithOpName("fetch"), relu); + float epsilon = 0.1f; + auto fbn = + ops::FusedBatchNormV3(s.WithOpName("fused_batch_norm"), input_cast, + scale, offset, mean, var, + ops::FusedBatchNormV3::IsTraining(is_training) + .Epsilon(epsilon) + .DataFormat("NHWC")); - auto input_t = GenerateRandomTensor({2, 8, 8, num_channels}); - auto scale_t = GenerateRandomTensor(channel_shape); - auto offset_t = GenerateRandomTensor(channel_shape); - auto mean_t = GenerateRandomTensor(is_training ? empty_shape - : channel_shape); - auto var_t = GenerateRandomTensor(is_training ? 
empty_shape - : channel_shape); + if (has_side_input) { + auto side_input = + Placeholder(s.WithOpName("side_input"), DT_FLOAT, + ops::Placeholder::Shape({2, 8, 8, num_channels})); + auto side_input_cast = + ops::Cast(s.WithOpName("side_input_cast"), side_input, DT_FLOAT); + auto add = ops::Add(s.WithOpName("add"), fbn.y, side_input_cast); + auto relu = ops::Relu(s.WithOpName("relu"), add); + } else { + auto relu = ops::Relu(s.WithOpName("relu"), fbn.y); + } - GrapplerItem item; - item.fetch = {"fetch"}; - item.feed = {{"input", input_t}, - {"scale", scale_t}, - {"offset", offset_t}, - {"mean", mean_t}, - {"var", var_t}}; - TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + auto input_t = GenerateRandomTensor({2, 8, 8, num_channels}); + auto scale_t = GenerateRandomTensor(channel_shape); + auto offset_t = GenerateRandomTensor(channel_shape); + auto mean_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto var_t = GenerateRandomTensor(is_training ? empty_shape + : channel_shape); + auto side_input_t = + GenerateRandomTensor({2, 8, 8, num_channels}); - // Place all nodes on CPU. - for (int i = 0; i < item.graph.node_size(); ++i) { - item.graph.mutable_node(i)->set_device("/device:CPU:0"); + GrapplerItem item; + item.fetch = {"relu"}; + if (has_side_input) + item.feed = {{"input", input_t}, {"scale", scale_t}, + {"offset", offset_t}, {"mean", mean_t}, + {"var", var_t}, {"side_input", side_input_t}}; + else + item.feed = {{"input", input_t}, + {"scale", scale_t}, + {"offset", offset_t}, + {"mean", mean_t}, + {"var", var_t}}; + TF_ASSERT_OK(s.ToGraphDef(&item.graph)); + + // Place all nodes on CPU. + for (int i = 0; i < item.graph.node_size(); ++i) { + item.graph.mutable_node(i)->set_device("/device:CPU:0"); + } + + Remapper optimizer(RewriterConfig::AGGRESSIVE); + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + + int found = 0; + if (has_side_input) { + for (const NodeDef& node : output.node()) { + if (node.name() == "add") { + EXPECT_EQ(node.op(), "Add"); + ASSERT_EQ(node.input_size(), 2); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + EXPECT_EQ(node.input(1), "side_input_cast"); + found++; + } + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Relu"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "add"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "FusedBatchNormV3"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + found++; + } + } + EXPECT_EQ(found, 3); + } else { + for (const NodeDef& node : output.node()) { + if (node.name() == "relu") { + EXPECT_EQ(node.op(), "Identity"); + ASSERT_EQ(node.input_size(), 1); + EXPECT_EQ(node.input(0), "fused_batch_norm"); + found++; + } + if (node.name() == "fused_batch_norm") { + EXPECT_EQ(node.op(), "_FusedBatchNormEx"); + ASSERT_EQ(node.input_size(), 5); + EXPECT_EQ(node.input(0), "input_cast"); + EXPECT_EQ(node.input(1), "scale"); + EXPECT_EQ(node.input(2), "offset"); + EXPECT_EQ(node.input(3), "mean"); + EXPECT_EQ(node.input(4), "var"); + + auto attr = node.attr(); + EXPECT_EQ(attr["num_side_inputs"].i(), 0); + EXPECT_EQ(attr["activation_mode"].s(), "Relu"); + found++; + } + } + EXPECT_EQ(found, 2); + } + + auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed); + ASSERT_EQ(tensors_expected.size(), 1); + auto tensors = EvaluateNodes(output, item.fetch, 
item.feed); + ASSERT_EQ(tensors.size(), 1); + test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } - - Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape - GraphDef output; - TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); - - int found = 0; - for (const NodeDef& node : output.node()) { - if (node.name() == "relu") { - EXPECT_EQ(node.op(), "Identity"); - ASSERT_EQ(node.input_size(), 1); - EXPECT_EQ(node.input(0), "fused_batch_norm"); - found++; - } - if (node.name() == "fused_batch_norm") { - EXPECT_EQ(node.op(), "_FusedBatchNormEx"); - ASSERT_EQ(node.input_size(), 5); - EXPECT_EQ(node.input(0), "input_cast"); - EXPECT_EQ(node.input(1), "scale"); - EXPECT_EQ(node.input(2), "offset"); - EXPECT_EQ(node.input(3), "mean"); - EXPECT_EQ(node.input(4), "var"); - - auto attr = node.attr(); - EXPECT_EQ(attr["num_side_inputs"].i(), 0); - EXPECT_EQ(attr["activation_mode"].s(), "Relu"); - found++; - } - } - EXPECT_EQ(found, 2); - } -} - -TEST_F(MklRemapperTest, FuseBatchNormWithAddAndRelu) { - using ::tensorflow::ops::Placeholder; - - for (bool is_training : {true, false}) { - tensorflow::Scope s = tensorflow::Scope::NewRootScope(); - - const int num_channels = 24; - - TensorShape input_shape({2, 8, 8, num_channels}); - TensorShape channel_shape({num_channels}); - TensorShape empty_shape({0}); - - auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, - ops::Placeholder::Shape(input_shape)); - auto input_cast = ops::Cast(s.WithOpName("input_cast"), input, DT_FLOAT); - auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT); - auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT); - auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT); - auto var = Placeholder(s.WithOpName("var"), DT_FLOAT); - auto side_input = Placeholder(s.WithOpName("side_input"), DT_FLOAT, - ops::Placeholder::Shape(input_shape)); - auto side_input_cast = - ops::Cast(s.WithOpName("side_input_cast"), side_input, DT_FLOAT); - - float epsilon = 0.1f; - auto fbn = ops::FusedBatchNormV3( - s.WithOpName("fused_batch_norm"), input_cast, scale, offset, mean, var, - ops::FusedBatchNormV3::IsTraining(is_training) - .Epsilon(epsilon) - .DataFormat("NHWC")); - auto add = ops::Add(s.WithOpName("add"), fbn.y, side_input_cast); - auto relu = ops::Relu(s.WithOpName("relu"), add); - auto fetch = ops::Identity(s.WithOpName("fetch"), relu); - - auto input_t = GenerateRandomTensor(input_shape); - auto scale_t = GenerateRandomTensor(channel_shape); - auto offset_t = GenerateRandomTensor(channel_shape); - auto mean_t = GenerateRandomTensor(is_training ? empty_shape - : channel_shape); - auto var_t = GenerateRandomTensor(is_training ? empty_shape - : channel_shape); - auto side_input_t = GenerateRandomTensor({2, 8, 8, num_channels}); - - GrapplerItem item; - item.fetch = {"fetch"}; - item.feed = {{"input", input_t}, {"scale", scale_t}, - {"offset", offset_t}, {"mean", mean_t}, - {"var", var_t}, {"side_input", side_input_t}}; - TF_ASSERT_OK(s.ToGraphDef(&item.graph)); - - // Place all nodes on CPU. 
- for (int i = 0; i < item.graph.node_size(); ++i) { - item.graph.mutable_node(i)->set_device("/device:CPU:0"); - } - - Remapper optimizer(RewriterConfig::AGGRESSIVE); // trust placeholders shape - GraphDef output; - TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); - - int found = 0; - for (const NodeDef& node : output.node()) { - if (node.name() == "add") { - EXPECT_EQ(node.op(), "Add"); - ASSERT_EQ(node.input_size(), 2); - EXPECT_EQ(node.input(0), "fused_batch_norm"); - EXPECT_EQ(node.input(1), "side_input_cast"); - found++; - } - if (node.name() == "relu") { - EXPECT_EQ(node.op(), "Relu"); - ASSERT_EQ(node.input_size(), 1); - EXPECT_EQ(node.input(0), "add"); - found++; - } - if (node.name() == "fused_batch_norm") { - EXPECT_EQ(node.op(), "FusedBatchNormV3"); - ASSERT_EQ(node.input_size(), 5); - EXPECT_EQ(node.input(0), "input_cast"); - EXPECT_EQ(node.input(1), "scale"); - EXPECT_EQ(node.input(2), "offset"); - EXPECT_EQ(node.input(3), "mean"); - EXPECT_EQ(node.input(4), "var"); - found++; - } - } - EXPECT_EQ(found, 3); } } #endif // ENABLE_MKLDNN_V1 diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index eeaaefc52c0..9a7d1953105 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -796,8 +796,9 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, [&](const utils::MutableNodeView& fused_batch_norm) -> bool { const auto* fused_batch_norm_node_def = fused_batch_norm.node(); if (!IsFusedBatchNorm(*fused_batch_norm_node_def)) return false; -// We fuse FusedBatchNorm on GPU or MKL CPU. + #ifndef ENABLE_MKLDNN_V1 + // We fuse FusedBatchNorm on GPU or MKL CPU. if (!NodeIsOnGpu(fused_batch_norm_node_def)) return false; #endif @@ -868,8 +869,8 @@ bool FindFusedBatchNormEx(const RemapperContext& ctx, int node_index, // Input to a Relu can be an Add node with FusedBatchNorm as one of the inputs if (IsAdd(*relu_fanin_0_node_def)) { -// Currently no CPU implementation for "FusedBatchNorm + SideInput + -// "" + // Currently no CPU implementation for "FusedBatchNorm + SideInput + + // "" #ifdef ENABLE_MKLDNN_V1 return false; #endif @@ -959,10 +960,10 @@ void CopyFusedBatchNormAttributes(const NodeDef& fused_batch_norm, // FusedBatchNormV2 and V3 have an extra type parameter. if (fused_batch_norm.op() != "FusedBatchNorm") { - (*attr)["U"] = src_attr.at("U"); + SetAttrValue(src_attr.at("U"), &(*attr)["U"]); } else { #ifndef ENABLE_MKLDNN_V1 - (*attr)["U"] = src_attr.at("T"); + SetAttrValue(src_attr.at("T"), &(*attr)["U"]); #else SetAttrValue(DT_FLOAT, &(*attr)["U"]); #endif diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index b009cbfb565..d5d59329c9a 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -8225,10 +8225,6 @@ tf_mkl_kernel_library( tf_mkl_kernel_library( name = "mkl_fused_batch_norm_op", srcs = ["mkl_fused_batch_norm_op.cc"], - hdrs = [ - "fused_batch_norm_op.h", - "no_op.h", - ], deps = NN_DEPS + [ ":fused_batch_norm_op", ":no_op", From db9b247cd1f3ff046359f7b64ca60c2d697fe2e1 Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Wed, 13 May 2020 19:53:08 -0700 Subject: [PATCH 161/412] Fix the functional model loading with nested sequential model. The nested sequential model is created with _is_graph_network = False, the current instance check is not strong enough. 
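A minimal sketch of the failure mode, assuming made-up layer sizes and using tf.keras.Model as a stand-in for the internal Functional class (an illustration, not the actual Keras loading path):

import tensorflow as tf

# Sequential subclasses the functional Model class, but built this way it
# starts with _is_graph_network = False and has no pre-existing node
# linking an input to an output.
inner = tf.keras.Sequential([tf.keras.layers.Dense(4)])

inputs = tf.keras.Input(shape=(8,))
outputs = inner(inputs)          # nests the sequential model inside a graph network
outer = tf.keras.Model(inputs, outputs)

# The patched check skips the first node only for true graph networks:
def should_skip_first_node(layer):
  return isinstance(layer, tf.keras.Model) and layer._is_graph_network

A plain isinstance check would presumably also skip the first node of the nested sequential model, which is the over-match this commit tightens.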
PiperOrigin-RevId: 311454248 Change-Id: I3b36cc037474587c134eab567d42694129c5cf52 --- tensorflow/python/keras/engine/functional.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index 80eb6cb27d5..c79e2849c4f 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -1017,7 +1017,9 @@ def _map_subgraph_network(inputs, outputs): def _should_skip_first_node(layer): """Returns True if the first layer node should not be saved or loaded.""" # Networks start with a pre-existing node linking their input to output. - return isinstance(layer, Functional) + # For a sequential model, it is first created with _is_graph_network = False, + # we have to keep the _is_graph_network check here. + return isinstance(layer, Functional) and layer._is_graph_network def _deserialize_keras_tensors(kwargs, layer_map): From 1549473a2e9e50ed9d3c751f25eaf7ee6db180d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 19:55:41 -0700 Subject: [PATCH 162/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311454462 Change-Id: If1168947a389a398dc9f2d50279c35212d21a973 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 598e3a48bfe..c6d67c9ad44 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 5e6479904941624cf7ce58ab3d236375c8012ef4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 20:19:40 -0700 Subject: [PATCH 163/412] Switch weights from per-value to per-input-item. 
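Before this change the op took a 1-D weights vector indexed by value and effectively emitted count[v] * weights[v]; after it, weights must have the same shape (or the same indices/row splits) as the input, each input element adds its own weight into its value's bin, and the output dtype follows the weights dtype. A rough NumPy sketch of the new per-input-item semantics, reusing the illustrative numbers from the tf.sparse.bincount docstring example added further down in this patch:

import numpy as np

values = np.array([[10, 20, 30, 20]])          # one batch row
weights = np.array([[2.0, 0.25, 15.0, 0.5]])   # same shape as values

# Bin v accumulates the weights at every position where values == v.
bins = {}
for v, w in zip(values.ravel(), weights.ravel()):
  bins[v] = bins.get(v, 0.0) + w
# bins == {10: 2.0, 20: 0.75, 30: 15.0}, matching the docstring output values.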
PiperOrigin-RevId: 311457055 Change-Id: I533b66dad37855bb264c73703c71d15da2ee2511 --- .../api_def_DenseCountSparseOutput.pbtxt | 23 +- .../api_def_RaggedCountSparseOutput.pbtxt | 27 +- .../api_def_SparseCountSparseOutput.pbtxt | 29 ++- tensorflow/core/kernels/count_ops.cc | 246 +++++++----------- tensorflow/core/ops/count_ops.cc | 39 +-- tensorflow/python/ops/bincount.py | 151 +++++++++-- tensorflow/python/ops/bincount_test.py | 188 +++++++++---- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v1/tensorflow.sparse.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v2/tensorflow.sparse.pbtxt | 2 +- 11 files changed, 441 insertions(+), 278 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt index 416da1ccaab..8296bfe6d7b 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt @@ -4,61 +4,62 @@ op { in_arg { name: "values" description: <>; +template +using BatchedMap = std::vector>; namespace { // TODO(momernick): Extend this function to work with outputs of rank > 2. -Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, +template +Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, bool is_1d, OpKernelContext* context) { int total_values = 0; int num_batches = per_batch_counts.size(); @@ -44,12 +47,12 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, context->allocate_output(1, TensorShape({total_values}), &values)); auto output_indices = indices->matrix(); - auto output_values = values->flat(); + auto output_values = values->flat(); int64 value_loc = 0; for (int b = 0; b < num_batches; ++b) { const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); std::sort(pairs.begin(), pairs.end()); for (const auto& x : pairs) { if (is_1d) { @@ -77,85 +80,19 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, return Status::OK(); } -Status OutputWeightedSparse(const BatchedIntMap& per_batch_counts, - int num_values, const Tensor& weights, bool is_1d, - OpKernelContext* context) { - if (!TensorShapeUtils::IsVector(weights.shape())) { - return errors::InvalidArgument( - "Weights must be a 1-dimensional tensor. Got: ", - weights.shape().DebugString()); - } - - if (num_values > weights.dim_size(0)) { - return errors::InvalidArgument("The maximum array value was ", num_values, - ", but the weight array has size ", - weights.shape().DebugString()); - } - auto weight_values = weights.flat(); - - int total_values = 0; - int num_batches = per_batch_counts.size(); - for (const auto& per_batch_count : per_batch_counts) { - total_values += per_batch_count.size(); - } - - Tensor* indices; - int inner_dim = is_1d ? 
1 : 2; - TF_RETURN_IF_ERROR(context->allocate_output( - 0, TensorShape({total_values, inner_dim}), &indices)); - - Tensor* values; - TF_RETURN_IF_ERROR( - context->allocate_output(1, TensorShape({total_values}), &values)); - - auto output_indices = indices->matrix(); - auto output_values = values->flat(); - int64 value_loc = 0; - for (int b = 0; b < num_batches; ++b) { - const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); - std::sort(pairs.begin(), pairs.end()); - for (const auto& x : pairs) { - if (is_1d) { - output_indices(value_loc, 0) = x.first; - } else { - output_indices(value_loc, 0) = b; - output_indices(value_loc, 1) = x.first; - } - output_values(value_loc) = x.second * weight_values(x.first); - ++value_loc; - } - } - - Tensor* dense_shape; - if (is_1d) { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({1}), &dense_shape)); - dense_shape->flat().data()[0] = num_values; - } else { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({2}), &dense_shape)); - dense_shape->flat().data()[0] = num_batches; - dense_shape->flat().data()[1] = num_values; - } - return Status::OK(); -} - -template -T GetOutputSize(T max_seen, T max_length, T min_length) { +int GetOutputSize(int max_seen, int max_length, int min_length) { return max_length > 0 ? max_length : std::max((max_seen + 1), min_length); } } // namespace -template +template class DenseCount : public OpKernel { public: explicit DenseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -170,6 +107,15 @@ class DenseCount : public OpKernel { "Input must be a 1 or 2-dimensional tensor. Got: ", data.shape().DebugString())); + if (use_weights) { + OP_REQUIRES( + context, weights.shape() == data.shape(), + errors::InvalidArgument( + "Weights and data must have the same shape. 
Weight shape: ", + weights.shape().DebugString(), + "; data shape: ", data.shape().DebugString())); + } + bool is_1d = TensorShapeUtils::IsVector(data.shape()); int negative_valued_axis = -1; int num_batch_dimensions = (data.shape().dims() + negative_valued_axis); @@ -179,19 +125,23 @@ class DenseCount : public OpKernel { num_batch_elements *= data.shape().dim_size(i); } int num_value_elements = data.shape().num_elements() / num_batch_elements; - auto per_batch_counts = BatchedIntMap(num_batch_elements); + auto per_batch_counts = BatchedMap(num_batch_elements); + T max_value = 0; const auto data_values = data.flat(); + const auto weight_values = weights.flat(); int i = 0; for (int b = 0; b < num_batch_elements; ++b) { for (int v = 0; v < num_value_elements; ++v) { const auto& value = data_values(i); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[b])[value] = 1; + if (binary_output_) { + per_batch_counts[b][value] = 1; + } else if (use_weights) { + per_batch_counts[b][value] += weight_values(i); } else { - (per_batch_counts[b])[value]++; + per_batch_counts[b][value]++; } if (value > max_value) { max_value = value; @@ -201,30 +151,24 @@ class DenseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; }; -template +template class SparseCount : public OpKernel { public: explicit SparseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -235,23 +179,27 @@ class SparseCount : public OpKernel { bool use_weights = weights.NumElements() > 0; bool is_1d = shape.NumElements() == 1; - const auto indices_values = indices.matrix(); - const auto values_values = values.flat(); - int num_batches = is_1d ? 1 : shape.flat()(0); int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + const auto indices_values = indices.matrix(); + const auto values_values = values.flat(); + const auto weight_values = weights.flat(); + + auto per_batch_counts = BatchedMap(num_batches); + T max_value = 0; for (int idx = 0; idx < num_values; ++idx) { int batch = is_1d ? 
0 : indices_values(idx, 0); const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch])[value] = 1; + if (binary_output_) { + per_batch_counts[batch][value] = 1; + } else if (use_weights) { + per_batch_counts[batch][value] += weight_values(idx); } else { - (per_batch_counts[batch])[value]++; + per_batch_counts[batch][value]++; } if (value > max_value) { max_value = value; @@ -259,30 +207,25 @@ class SparseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -template +template class RaggedCount : public OpKernel { public: explicit RaggedCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -290,13 +233,15 @@ class RaggedCount : public OpKernel { const Tensor& values = context->input(1); const Tensor& weights = context->input(2); bool use_weights = weights.NumElements() > 0; + bool is_1d = false; const auto splits_values = splits.flat(); const auto values_values = values.flat(); + const auto weight_values = weights.flat(); int num_batches = splits.NumElements() - 1; int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + auto per_batch_counts = BatchedMap(num_batches); T max_value = 0; int batch_idx = 0; @@ -306,10 +251,12 @@ class RaggedCount : public OpKernel { } const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch_idx - 1])[value] = 1; + if (binary_output_) { + per_batch_counts[batch_idx - 1][value] = 1; + } else if (use_weights) { + per_batch_counts[batch_idx - 1][value] += weight_values(idx); } else { - (per_batch_counts[batch_idx - 1])[value]++; + per_batch_counts[batch_idx - 1][value]++; } if (value > max_value) { max_value = value; @@ -317,42 +264,47 @@ class RaggedCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, false, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - false, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -#define REGISTER(TYPE) \ - \ - REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ - 
.TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - DenseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - SparseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - RaggedCount) +#define REGISTER_W(W_TYPE) \ + REGISTER(int32, W_TYPE) \ + REGISTER(int64, W_TYPE) -REGISTER(int32); -REGISTER(int64); +#define REGISTER(I_TYPE, W_TYPE) \ + \ + REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + DenseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + SparseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + RaggedCount) + +TF_CALL_INTEGRAL_TYPES(REGISTER_W); +TF_CALL_float(REGISTER_W); +TF_CALL_double(REGISTER_W); + +#undef REGISTER_W #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/ops/count_ops.cc b/tensorflow/core/ops/count_ops.cc index c9fbe1f8d8e..8de0a2ef954 100644 --- a/tensorflow/core/ops/count_ops.cc +++ b/tensorflow/core/ops/count_ops.cc @@ -19,12 +19,21 @@ limitations under the License. namespace tensorflow { -using shape_inference::DimensionHandle; using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; Status DenseCountSparseOutputShapeFn(InferenceContext *c) { - int32 rank = c->Rank(c->input(0)); - DimensionHandle nvals = c->UnknownDim(); + auto values = c->input(0); + auto weights = c->input(1); + ShapeHandle output; + auto num_weights = c->NumElements(weights); + if (c->ValueKnown(num_weights) && c->Value(num_weights) == 0) { + output = values; + } else { + TF_RETURN_IF_ERROR(c->Merge(weights, values, &output)); + } + auto rank = c->Rank(output); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -32,8 +41,8 @@ Status DenseCountSparseOutputShapeFn(InferenceContext *c) { } Status SparseCountSparseOutputShapeFn(InferenceContext *c) { - DimensionHandle rank = c->Dim(c->input(0), 1); - DimensionHandle nvals = c->UnknownDim(); + auto rank = c->Dim(c->input(0), 1); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -45,7 +54,7 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { if (rank != c->kUnknownRank) { ++rank; // Add the ragged dimension } - DimensionHandle nvals = c->UnknownDim(); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -54,12 +63,12 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { REGISTER_OP("DenseCountSparseOutput") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(DenseCountSparseOutputShapeFn) .Output("output_indices: int64") 
.Output("output_values: output_type") @@ -69,12 +78,12 @@ REGISTER_OP("SparseCountSparseOutput") .Input("indices: int64") .Input("values: T") .Input("dense_shape: int64") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(SparseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -83,12 +92,12 @@ REGISTER_OP("SparseCountSparseOutput") REGISTER_OP("RaggedCountSparseOutput") .Input("splits: int64") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(RaggedCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") diff --git a/tensorflow/python/ops/bincount.py b/tensorflow/python/ops/bincount.py index e1b3bebaaaa..68950eaf596 100644 --- a/tensorflow/python/ops/bincount.py +++ b/tensorflow/python/ops/bincount.py @@ -18,10 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import gen_count_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util.tf_export import tf_export @@ -33,7 +33,7 @@ def sparse_bincount(values, axis=0, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, name=None): """Count the number of times an integer value appears in a tensor. @@ -58,8 +58,9 @@ def sparse_bincount(values, maxlength: If given, skips `values` that are greater than or equal to `maxlength`, and ensures that the output has a `dense_shape` of at most `maxlength` in the inner dimension. - binary_count: Whether to do a binary count. When True, this op will return 1 - for any value that exists instead of counting the number of occurrences. + binary_output: If True, this op will output 1 instead of the number of times + a token appears (equivalent to one_hot + reduce_any instead of one_hot + + reduce_add). Defaults to False. name: A name for this op. Returns: @@ -78,7 +79,7 @@ def sparse_bincount(values, SparseTensor) and returns a SparseTensor where the value of (i,j) is the number of times value j appears in batch i. - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount(data, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( @@ -102,7 +103,7 @@ def sparse_bincount(values, dense shape is [2, 500] instead of [2,10002] or [2, 102]. >>> minlength = maxlength = 500 - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount( ... 
data, axis=-1, minlength=minlength, maxlength=maxlength) >>> print(output) @@ -123,8 +124,8 @@ def sparse_bincount(values, some values (like 20 in batch 1 and 11 in batch 2) appear more than once, the 'values' tensor is all 1s. - >>> dense = [[10, 20, 30, 20], [11, 101, 11, 10001]] - >>> output = tf.sparse.bincount(dense, binary_count=True, axis=-1) + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> output = tf.sparse.bincount(data, binary_output=True, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( [[ 0 10] @@ -136,20 +137,42 @@ def sparse_bincount(values, values=tf.Tensor([1 1 1 1 1 1], shape=(6,), dtype=int64), dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + **Weighted bin-counting** + + This example takes two inputs - a values tensor and a weights tensor. These + tensors must be identically shaped, and have the same row splits or indices + in the case of RaggedTensors or SparseTensors. When performing a weighted + count, the op will output a SparseTensor where the value of (i, j) is the + sum of the values in the weight tensor's batch i in the locations where + the values tensor has the value j. In this case, the output dtype is the + same as the dtype of the weights tensor. + + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> weights = [[2, 0.25, 15, 0.5], [2, 17, 3, 0.9]] + >>> output = tf.sparse.bincount(data, weights=weights, axis=-1) + >>> print(output) + SparseTensor(indices=tf.Tensor( + [[ 0 10] + [ 0 20] + [ 0 30] + [ 1 11] + [ 1 101] + [ 1 10001]], shape=(6, 2), dtype=int64), + values=tf.Tensor([2. 0.75 15. 5. 17. 0.9], shape=(6,), dtype=float32), + dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + """ with ops.name_scope(name, "count", [values, weights]): if not isinstance(values, sparse_tensor.SparseTensor): values = ragged_tensor.convert_to_tensor_or_ragged_tensor( values, name="values") + if weights is not None: + if not isinstance(weights, sparse_tensor.SparseTensor): + weights = ragged_tensor.convert_to_tensor_or_ragged_tensor( + weights, name="weights") - if weights is not None and binary_count: - raise ValueError("binary_count and weights are mutually exclusive.") - - if weights is None: - weights = [] - output_type = dtypes.int64 - else: - output_type = dtypes.float32 + if weights is not None and binary_output: + raise ValueError("binary_output and weights are mutually exclusive.") if axis is None: axis = 0 @@ -162,38 +185,114 @@ def sparse_bincount(values, maxlength_value = maxlength if maxlength is not None else -1 if axis == 0: - if isinstance(values, - (sparse_tensor.SparseTensor, ragged_tensor.RaggedTensor)): + if isinstance(values, sparse_tensor.SparseTensor): + if weights is not None: + weights = validate_sparse_weights(values, weights) + values = values.values + elif isinstance(values, ragged_tensor.RaggedTensor): + if weights is not None: + weights = validate_ragged_weights(values, weights) values = values.values else: + if weights is not None: + weights = array_ops.reshape(weights, [-1]) values = array_ops.reshape(values, [-1]) if isinstance(values, sparse_tensor.SparseTensor): + weights = validate_sparse_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.sparse_count_sparse_output( values.indices, values.values, values.dense_shape, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) elif isinstance(values, 
ragged_tensor.RaggedTensor): + weights = validate_ragged_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.ragged_count_sparse_output( values.row_splits, values.values, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) else: + weights = validate_dense_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.dense_count_sparse_output( values, weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) return sparse_tensor.SparseTensor(c_ind, c_val, c_shape) + + +def validate_dense_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.dtype) + + if not isinstance(weights, ops.Tensor): + raise ValueError( + "`weights` must be a tf.Tensor if `values` is a tf.Tensor.") + + return weights + + +def validate_sparse_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, sparse_tensor.SparseTensor): + raise ValueError( + "`weights` must be a SparseTensor if `values` is a SparseTensor.") + + checks = [] + if weights.dense_shape is not values.dense_shape: + checks.append( + check_ops.assert_equal( + weights.dense_shape, + values.dense_shape, + message="'weights' and 'values' must have the same dense shape.")) + if weights.indices is not values.indices: + checks.append( + check_ops.assert_equal( + weights.indices, + values.indices, + message="'weights' and 'values' must have the same indices.") + ) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights + + +def validate_ragged_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, ragged_tensor.RaggedTensor): + raise ValueError( + "`weights` must be a RaggedTensor if `values` is a RaggedTensor.") + + checks = [] + if weights.row_splits is not values.row_splits: + checks.append( + check_ops.assert_equal( + weights.row_splits, + values.row_splits, + message="'weights' and 'values' must have the same row splits.")) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights diff --git a/tensorflow/python/ops/bincount_test.py b/tensorflow/python/ops/bincount_test.py index 776b65b72d0..839af8dcc35 100644 --- a/tensorflow/python/ops/bincount_test.py +++ b/tensorflow/python/ops/bincount_test.py @@ -21,6 +21,8 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np +from tensorflow.python.eager import context +from tensorflow.python.framework import errors from tensorflow.python.ops import bincount from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_factory_ops @@ -65,7 +67,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", 
"x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -73,7 +75,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -82,7 +84,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 9], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -91,40 +93,40 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 8], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_no_maxlength_weights", "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], - "expected_values": [1, 2, 3, 8, 5], + "expected_values": [2, 1, 0.5, 9, 3], "expected_shape": [2, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": [[0.5, 1, 2], [3, 4, 5]] }, { "testcase_name": "_maxlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "maxlength": 7, "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], - "expected_values": [1, 2, 3, 0.5, 8], + "expected_values": [2, 1, 0.5, 3, 9], "expected_shape": [2, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[0.5, 1, 2, 11], [7, 3, 4, 5]] }, { "testcase_name": "_minlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 9, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_minlength_larger_values_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 3, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_1d", "x": np.array([3, 2, 1, 1], dtype=np.int32), @@ -146,7 +148,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): y = bincount.sparse_bincount( @@ -154,7 +156,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): weights=weights, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -216,7 +218,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], "expected_values": [1, 1, 1, 1], "expected_shape": [3, 6], - "binary_count": + "binary_output": True, }, { @@ -230,7 +232,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 7], "maxlength": 7, - "binary_count": + "binary_output": True, }, { @@ -244,7 +246,7 @@ class 
TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 9], "minlength": 9, - "binary_count": + "binary_output": True, }, { @@ -258,7 +260,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 8], "minlength": 3, - "binary_count": + "binary_output": True, }, { @@ -268,9 +270,10 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 0, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -279,11 +282,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 7, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 7], "maxlength": 7, - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 14, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -292,11 +296,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 9], "minlength": 9, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": @@ -305,11 +310,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 8], "minlength": 3, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": "_1d", @@ -338,16 +344,17 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_sparse = sparse_ops.from_dense(x) + w_sparse = sparse_ops.from_dense(weights) if weights is not None else None y = bincount.sparse_bincount( x_sparse, - weights=weights, + weights=w_sparse, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -393,7 +400,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", @@ -402,7 +409,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", @@ -412,13 +419,13 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [5, 9], - 
"binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "minlength": 3, - "binary_count": True, + "binary_output": True, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], @@ -428,18 +435,18 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "testcase_name": "_no_maxlength_weights", "x": [[], [], [3, 0, 1], [], [5, 0, 4, 4]], "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": [[], [], [6, 0.5, 2], [], [10, 0.25, 5, 3]], }, { "testcase_name": "_maxlength_weights", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "maxlength": 7, "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_weights", @@ -447,9 +454,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 9, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_larger_values_weights", @@ -457,9 +464,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 3, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_1d", @@ -484,21 +491,114 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_ragged = ragged_factory_ops.constant(x) + w = ragged_factory_ops.constant(weights) if weights is not None else None y = bincount.sparse_bincount( x_ragged, - weights=weights, + weights=w, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) self.assertAllEqual(expected_shape, y.dense_shape) +class TestSparseCountFailureModes(test.TestCase): + + def test_dense_input_sparse_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_ragged_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_wrong_shape_fails(self): + x = 
np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = np.array([[3, 2], [5, 4], [4, 3]]) + # Note: Eager mode and graph mode throw different errors here. Graph mode + # will fail with a ValueError from the shape checking logic, while Eager + # will fail with an InvalidArgumentError from the kernel itself. + if context.executing_eagerly(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + else: + with self.assertRaisesRegexp(ValueError, "both shapes must be equal"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_dense_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_ragged_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 0, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same indices"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_too_many_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Incompatible shapes"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_shape_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4], [0, 0, 0, 0]], + dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same dense shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_dense_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_sparse_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_different_shape_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = ragged_factory_ops.constant([[6, 0.5, 2], 
[], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same row splits"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt index 4c4f6c62291..f8f8edb26a8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, 
defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt index a9ad81920dd..67235bb2cf2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt @@ -10,7 +10,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" From 486a076444e6aea7fabca8c2b984d1b6c2e50daa Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 13 May 2020 20:41:31 -0700 Subject: [PATCH 164/412] support tpu strategy for crossing. 
PiperOrigin-RevId: 311459380 Change-Id: I51e71d267147c6db2cba449788be63066a4f37bb --- .../preprocessing/categorical_crossing.py | 4 --- .../categorical_crossing_distribution_test.py | 27 ++++++++++++++++--- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py index 88b552e23b7..68848458bb2 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing.py @@ -20,7 +20,6 @@ from __future__ import print_function import itertools -from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape @@ -110,9 +109,6 @@ class CategoryCrossing(Layer): self._depth_tuple = depth elif depth is not None: self._depth_tuple = tuple([i for i in range(1, depth + 1)]) - strategy = ds_context.get_strategy() - if strategy.__class__.__name__.startswith('TPUStrategy'): - raise ValueError('TPU strategy is not support for this layer yet.') def partial_crossing(self, partial_inputs, ragged_out, sparse_out): """Gets the crossed output from a partial list/tuple of inputs.""" diff --git a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py index e1ba91e3558..57dea6edf4a 100644 --- a/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/categorical_crossing_distribution_test.py @@ -21,8 +21,10 @@ from __future__ import print_function import numpy as np from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations from tensorflow.python.distribute import strategy_combinations +from tensorflow.python.distribute import tpu_strategy from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.keras import keras_parameterized @@ -31,10 +33,22 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util from tensorflow.python.platform import test +def batch_wrapper(dataset, batch_size, distribution, repeat=None): + if repeat: + dataset = dataset.repeat(repeat) + # TPUs currently require fully defined input shapes, drop_remainder ensures + # the input will have fully defined shapes. + if isinstance(distribution, + (tpu_strategy.TPUStrategy, tpu_strategy.TPUStrategyV1)): + return dataset.batch(batch_size, drop_remainder=True) + else: + return dataset.batch(batch_size) + + @combinations.generate( combinations.combine( # Investigate why crossing is not supported with TPU. 
- distribution=strategy_combinations.strategies_minus_tpu, + distribution=strategy_combinations.all_strategies, mode=['eager', 'graph'])) class CategoryCrossingDistributionTest( keras_parameterized.TestCase, @@ -43,6 +57,9 @@ class CategoryCrossingDistributionTest( def test_distribution(self, distribution): input_array_1 = np.array([['a', 'b'], ['c', 'd']]) input_array_2 = np.array([['e', 'f'], ['g', 'h']]) + inp_dataset = dataset_ops.DatasetV2.from_tensor_slices( + {'input_1': input_array_1, 'input_2': input_array_2}) + inp_dataset = batch_wrapper(inp_dataset, 2, distribution) # pyformat: disable expected_output = [[b'a_X_e', b'a_X_f', b'b_X_e', b'b_X_f'], @@ -50,13 +67,15 @@ class CategoryCrossingDistributionTest( config.set_soft_device_placement(True) with distribution.scope(): - input_data_1 = keras.Input(shape=(2,), dtype=dtypes.string) - input_data_2 = keras.Input(shape=(2,), dtype=dtypes.string) + input_data_1 = keras.Input(shape=(2,), dtype=dtypes.string, + name='input_1') + input_data_2 = keras.Input(shape=(2,), dtype=dtypes.string, + name='input_2') input_data = [input_data_1, input_data_2] layer = categorical_crossing.CategoryCrossing() int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict([input_array_1, input_array_2]) + output_dataset = model.predict(inp_dataset) self.assertAllEqual(expected_output, output_dataset) From fc56619a9b43fd6df93f4ee234a303fe77f05fb1 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Wed, 13 May 2020 20:44:56 -0700 Subject: [PATCH 165/412] Add the newly-added 'nnapi_allow_fp16' option to README. PiperOrigin-RevId: 311459737 Change-Id: I1f132096008a142f403c811289f387852225d3e5 --- tensorflow/lite/tools/benchmark/README.md | 1 + tensorflow/lite/tools/delegates/README.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md index c44129cbbd3..ae7e4ae150d 100644 --- a/tensorflow/lite/tools/benchmark/README.md +++ b/tensorflow/lite/tools/benchmark/README.md @@ -73,6 +73,7 @@ Note when `use_legacy_nnapi` is selected, this parameter won't work. `/data/local/tmp/` and this benchmark tool will not correctly use NNAPI. * `nnapi_accelerator_name`: `str` (default="") * `disable_nnapi_cpu`: `bool` (default=false) +* `nnapi_allow_fp16`: `bool` (default=false) #### Hexagon delegate * `use_hexagon`: `bool` (default=false) diff --git a/tensorflow/lite/tools/delegates/README.md b/tensorflow/lite/tools/delegates/README.md index 709fcffb24d..bc1bffd49b6 100644 --- a/tensorflow/lite/tools/delegates/README.md +++ b/tensorflow/lite/tools/delegates/README.md @@ -73,6 +73,8 @@ TFLite delegate. [NNAPI CPU reference implementation](https://developer.android.com/ndk/guides/neuralnetworks#device-assignment) from the possible devices to be used by NNAPI to execute the model. This option is ignored if `nnapi_accelerator_name` is specified. +* `nnapi_allow_fp16`: `bool` (default=false) \ + Whether to allow FP32 computation to be run in FP16. ### Hexagon delegate provider * `use_hexagon`: `bool` (default=false) \ From 3f197f3f0e562be8b5ca04acb66487c8864ae5e6 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 13 May 2020 20:48:37 -0700 Subject: [PATCH 166/412] Optimize broadcast int8 max. 
PiperOrigin-RevId: 311460102 Change-Id: Id1b3f64deca0d9aca7608985393be5814763817f --- .../internal/optimized/optimized_ops.h | 156 ++++++++++++++++++ tensorflow/lite/kernels/maximum_minimum.cc | 64 +++++-- 2 files changed, 205 insertions(+), 15 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index a6d37f4f1ed..c72400f33a5 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -7921,6 +7921,162 @@ void Transpose(const TransposeParams& unshrinked_params, shrinked_output_shape, output_data); } +// Assume input1 & input2 have the same scale & zero point. +inline void MaximumElementwise(int size, const ArithmeticParams& params, + const int8* input1_data, const int8* input2_data, + int8* output_data) { + ruy::profiler::ScopeLabel label("MaximumElementwiseInt8/8bit"); + + int i = 0; +#ifdef USE_NEON + for (; i <= size - 8; i += 8) { + const int8x8_t input1_val_original = vld1_s8(input1_data + i); + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int8x8_t max_data = vmax_s8(input1_val_original, input2_val_original); + vst1_s8(output_data + i, max_data); + } +#endif // NEON + for (; i < size; ++i) { + const int8 input1_val = input1_data[i]; + const int8 input2_val = input2_data[i]; + output_data[i] = std::max(input1_val, input2_val); + } +} + +inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params, + int8 input1_data, const int8* input2_data, + int8* output_data) { + ruy::profiler::ScopeLabel label("MaximumScalarBroadcastInt8/8bit"); + int i = 0; + +#ifdef USE_NEON + const int8x8_t input1_val_original = vdup_n_s8(input1_data); + for (; i <= size - 8; i += 8) { + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int8x8_t max_data = vmax_s8(input1_val_original, input2_val_original); + vst1_s8(output_data + i, max_data); + } +#endif // NEON + for (; i < size; ++i) { + const int8 input2_val = input2_data[i]; + output_data[i] = std::max(input1_data, input2_val); + } +} + +inline void BroadcastMaximumFivefold( + const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const int8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const int8* unswitched_input2_data, const RuntimeShape& output_shape, + int8* output_data) { + ruy::profiler::ScopeLabel label("BroadcastMaximumFivefoldInt8/8bit"); + + ArithmeticParams switched_params = unswitched_params; + switched_params.input1_offset = unswitched_params.input2_offset; + switched_params.input1_multiplier = unswitched_params.input2_multiplier; + switched_params.input1_shift = unswitched_params.input2_shift; + switched_params.input2_offset = unswitched_params.input1_offset; + switched_params.input2_multiplier = unswitched_params.input1_multiplier; + switched_params.input2_shift = unswitched_params.input1_shift; + + const bool use_unswitched = + unswitched_params.broadcast_category == + tflite::BroadcastableOpCategory::kFirstInputBroadcastsFast; + + const ArithmeticParams& params = + use_unswitched ? unswitched_params : switched_params; + const int8* input1_data = + use_unswitched ? unswitched_input1_data : unswitched_input2_data; + const int8* input2_data = + use_unswitched ? unswitched_input2_data : unswitched_input1_data; + + // Fivefold nested loops. The second input resets its position for each + // iteration of the second loop. 
The first input resets its position at the + // beginning of the fourth loop. The innermost loop is an elementwise add of + // sections of the arrays. + int8* output_data_ptr = output_data; + const int8* input1_data_ptr = input1_data; + const int8* input2_data_reset = input2_data; + // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared + // between input shapes. y3 for input 1 is always broadcast, and so the + // dimension there is 1, whereas optionally y1 might be broadcast for input 2. + // Put another way, + // input1.shape.FlatSize = y0 * y1 * y2 * y4, + // input2.shape.FlatSize = y0 * y2 * y3 * y4. + int y0 = params.broadcast_shape[0]; + int y1 = params.broadcast_shape[1]; + int y2 = params.broadcast_shape[2]; + int y3 = params.broadcast_shape[3]; + int y4 = params.broadcast_shape[4]; + if (y4 > 1) { + // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner + // dimension. + for (int i0 = 0; i0 < y0; ++i0) { + const int8* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + for (int i3 = 0; i3 < y3; ++i3) { + MaximumElementwise(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y4; + output_data_ptr += y4; + } + // We have broadcast y4 of input1 data y3 times, and now move on. + input1_data_ptr += y4; + } + } + // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on. + input2_data_reset = input2_data_ptr; + } + } else { + // Special case of y4 == 1, in which the innermost loop is a single element + // and can be combined with the next (y3) as an inner broadcast. + // + // Note that this handles the case of pure scalar broadcast when + // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar + // broadcast with batch (as y2 > 1). + // + // NOTE The process is the same as the above general case except simplified + // for y4 == 1 and the loop over y3 is contained within the + // AddScalarBroadcast function. + for (int i0 = 0; i0 < y0; ++i0) { + const int8* input2_data_ptr = nullptr; + for (int i1 = 0; i1 < y1; ++i1) { + input2_data_ptr = input2_data_reset; + for (int i2 = 0; i2 < y2; ++i2) { + MaximumScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, + output_data_ptr); + input2_data_ptr += y3; + output_data_ptr += y3; + input1_data_ptr += 1; + } + } + input2_data_reset = input2_data_ptr; + } + } +} + +// TODO(b/156140316): Try to unify the broadcast dispatch logic for binary ops. +template +inline void BroadcastMaximumDispatch(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int8* input1_data, + const RuntimeShape& input2_shape, + const int8* input2_data, + const RuntimeShape& output_shape, + int8* output_data, Op op) { + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { + return reference_ops::MaximumMinimumBroadcastSlow( + input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data, op); + } + + BroadcastMaximumFivefold(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data); +} + } // namespace optimized_ops } // namespace tflite diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc index 3c6c524c13d..abe9647f69e 100644 --- a/tensorflow/lite/kernels/maximum_minimum.cc +++ b/tensorflow/lite/kernels/maximum_minimum.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" #include "tensorflow/lite/kernels/internal/tensor.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -31,6 +32,7 @@ namespace maximum_minimum { // This file has a reference implementation of TFMaximum/TFMinimum. enum KernelType { kReference, + kGenericOptimized, }; constexpr int kInputTensor1 = 0; @@ -85,7 +87,7 @@ struct MinimumOp { } }; -template +template void TFLiteOperation(TfLiteContext* context, TfLiteNode* node, const OpContext& op_context) { reference_ops::MaximumMinimumBroadcastSlow( @@ -98,29 +100,57 @@ void TFLiteOperation(TfLiteContext* context, TfLiteNode* node, op_type::template op); } +// Maximum generic opt int8. +template <> +void TFLiteOperation( + TfLiteContext* context, TfLiteNode* node, const OpContext& op_context) { + tflite::ArithmeticParams op_params; + const bool need_broadcast = optimized_ops::ProcessBroadcastShapes( + GetTensorShape(op_context.input1), GetTensorShape(op_context.input2), + &op_params); + if (need_broadcast) { + optimized_ops::BroadcastMaximumDispatch( + op_params, GetTensorShape(op_context.input1), + GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), + GetTensorData(op_context.input2), + GetTensorShape(op_context.output), + GetTensorData(op_context.output), MaximumOp::template op); + return; + } + reference_ops::MaximumMinimumBroadcastSlow( + GetTensorShape(op_context.input1), GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), GetTensorData(op_context.input2), + GetTensorShape(op_context.output), GetTensorData(op_context.output), + MaximumOp::template op); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpContext op_context(context, node); - if (kernel_type == kReference) { switch (op_context.output->type) { case kTfLiteFloat32: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, op_context); break; case kTfLiteUInt8: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; case kTfLiteInt8: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, op_context); break; case kTfLiteInt32: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; case kTfLiteInt64: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; case kTfLiteInt16: - TFLiteOperation(context, node, op_context); + TFLiteOperation(context, node, + op_context); break; default: context->ReportError(context, @@ -128,12 +158,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { op_context.output->type); return kTfLiteError; } - } else { - context->ReportError(context, - "Type %d is currently not supported by Maximum.", - op_context.output->type); - return kTfLiteError; - } return kTfLiteOk; } @@ -147,6 +171,14 @@ TfLiteRegistration* Register_MAXIMUM_REF() { return &r; } +TfLiteRegistration* Register_MAXIMUM_GENERIC_OPT() { + static TfLiteRegistration r = { + nullptr, nullptr, maximum_minimum::Prepare, + maximum_minimum::Eval}; + return &r; +} + TfLiteRegistration* Register_MINIMUM_REF() { static TfLiteRegistration r = { nullptr, nullptr, maximum_minimum::Prepare, @@ -154,7 +186,9 @@ TfLiteRegistration* Register_MINIMUM_REF() { maximum_minimum::MinimumOp>}; return &r; } -TfLiteRegistration* 
Register_MAXIMUM() { return Register_MAXIMUM_REF(); } +TfLiteRegistration* Register_MAXIMUM() { + return Register_MAXIMUM_GENERIC_OPT(); +} TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); } } // namespace builtin From 3f75577690a2a1b420727ceef7f46e40697a6ce4 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 13 May 2020 21:10:03 -0700 Subject: [PATCH 167/412] Drop the dependency on ruy:detect_arm, which is becoming private. PiperOrigin-RevId: 311462480 Change-Id: I74c62386997b34022301e673275856b77992a1b2 --- tensorflow/lite/kernels/internal/BUILD | 5 +- .../kernels/internal/optimized/cpu_check.cc | 50 +++++++++++++++++++ .../kernels/internal/optimized/cpu_check.h | 8 +-- .../internal/optimized/neon_tensor_utils.cc | 3 +- 4 files changed, 57 insertions(+), 9 deletions(-) create mode 100644 tensorflow/lite/kernels/internal/optimized/cpu_check.cc diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 5958a9c1098..93292fbb640 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -654,7 +654,6 @@ cc_library( "//tensorflow/lite/kernels:cpu_backend_context", "//tensorflow/lite/kernels:cpu_backend_gemm", "@ruy//ruy", - "@ruy//ruy:detect_arm", ], ) @@ -1039,6 +1038,7 @@ cc_test( cc_library( name = "cpu_check", + srcs = ["optimized/cpu_check.cc"], hdrs = [ "optimized/cpu_check.h", "optimized/neon_check.h", @@ -1058,9 +1058,6 @@ cc_library( ":windows": tflite_deps_intel, "//conditions:default": [], }, - deps = [ - "@ruy//ruy:detect_arm", # safe to use regardless of arch. - ], ) cc_test( diff --git a/tensorflow/lite/kernels/internal/optimized/cpu_check.cc b/tensorflow/lite/kernels/internal/optimized/cpu_check.cc new file mode 100644 index 00000000000..8fd17a7e33a --- /dev/null +++ b/tensorflow/lite/kernels/internal/optimized/cpu_check.cc @@ -0,0 +1,50 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" + +#if defined __linux__ && defined __aarch64__ +#include +#endif + +namespace tflite { + +namespace { + +// The implementation of dotprod detection is copied from ruy's internal +// function DetectDotprod(). +// At the moment it's only implemented on Linux ARM64. Consider syncing again +// with ruy in the future to share improvements. +#if defined __linux__ && defined __aarch64__ +bool DetectDotprodByLinuxAuxvMethod() { + // This is the value of HWCAP_ASIMDDP in sufficiently recent Linux headers, + // however we need to support building against older headers for the time + // being. 
+ const int kLocalHwcapAsimddp = 1 << 20; + return getauxval(AT_HWCAP) & kLocalHwcapAsimddp; +} +#endif + +} // namespace + +bool DetectArmNeonDotprod() { +#if defined __linux__ && defined __aarch64__ + return DetectDotprodByLinuxAuxvMethod(); +#endif + + return false; +} + +} // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/cpu_check.h b/tensorflow/lite/kernels/internal/optimized/cpu_check.h index 2c02e756f14..b39371a3e2f 100644 --- a/tensorflow/lite/kernels/internal/optimized/cpu_check.h +++ b/tensorflow/lite/kernels/internal/optimized/cpu_check.h @@ -15,8 +15,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_ #define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CPU_CHECK_H_ -#include "ruy/detect_arm.h" // from @ruy - // This include is superfluous. However, it's been here for a while, and a // number of files have been relying on it to include neon_check.h for them. // This should be removed, but with a global run of presubmits to catch @@ -25,12 +23,16 @@ limitations under the License. namespace tflite { +// On A64, returns true if the dotprod extension is present. +// On other architectures, returns false unconditionally. +bool DetectArmNeonDotprod(); + struct CpuFlags { bool neon_dotprod = false; }; inline void GetCpuFlags(CpuFlags* cpu_flags) { - cpu_flags->neon_dotprod = ruy::DetectDotprod(); + cpu_flags->neon_dotprod = DetectArmNeonDotprod(); } } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 4d8c20074d5..4c90cd86a56 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -23,7 +23,6 @@ limitations under the License. #include #include -#include "ruy/detect_arm.h" // from @ruy #include "ruy/ruy.h" // from @ruy #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/cpu_backend_gemm.h" @@ -80,7 +79,7 @@ inline void* aligned_alloc(size_t alignment, size_t size, } bool HasSdotInstruction() { - static const bool has_dotprod = ruy::DetectDotprod(); + static const bool has_dotprod = DetectArmNeonDotprod(); return has_dotprod; } From 5bd2ae7b8a491055842d7f8c0dd8dccc947fa4d5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 21:15:51 -0700 Subject: [PATCH 168/412] Legalize tflite CustomOp PiperOrigin-RevId: 311463041 Change-Id: I1a8eda844814ce08b247c94ad8ec1fb5debea033 --- .../compiler/mlir/lite/transforms/lower_static_tensor_list.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index a69b0a3c624..49be29065fe 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -859,6 +859,7 @@ LogicalResult LowerStaticTensorListPass::RewriteFunction( target.addLegalOp(); target.addLegalOp(); target.addLegalOp(); + target.addLegalOp(); // Register fused LSTM/RNN ops as legal. target.addLegalOp(); target.addLegalOp(); From 63262ea46da769530412d2591cf692f9d018e6ab Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 21:47:35 -0700 Subject: [PATCH 169/412] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 311466009 Change-Id: Id2a01503a9a383b197047968be3c385bbd5238ea --- tensorflow/go/op/wrappers.go | 124 +++++++++++++++++------------------ 1 file changed, 59 insertions(+), 65 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c6d67c9ad44..e6725269279 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -4715,7 +4715,7 @@ type DenseCountSparseOutputAttr func(optionalAttr) // DenseCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4727,7 +4727,7 @@ func DenseCountSparseOutputMinlength(value int64) DenseCountSparseOutputAttr { // DenseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4742,20 +4742,20 @@ func DenseCountSparseOutputMaxlength(value int64) DenseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// values: int32 or int64; Tensor containing data to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// values: Tensor containing data to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. May +// also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. -func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. +func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_output bool, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } @@ -8607,7 +8607,7 @@ type RaggedCountSparseOutputAttr func(optionalAttr) // RaggedCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. 
// If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8619,7 +8619,7 @@ func RaggedCountSparseOutputMinlength(value int64) RaggedCountSparseOutputAttr { // RaggedCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8634,33 +8634,27 @@ func RaggedCountSparseOutputMaxlength(value int64) RaggedCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// splits: int64; Tensor containing the row splits of the ragged tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// splits: Tensor containing the row splits of the ragged tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// END -// } -// out_arg { -// name: "output_dense_shape" -// description: <= -1 @@ -13718,7 +13712,7 @@ func SparseCountSparseOutputMinlength(value int64) SparseCountSparseOutputAttr { // SparseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -13733,22 +13727,22 @@ func SparseCountSparseOutputMaxlength(value int64) SparseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// indices: int64; Tensor containing the indices of the sparse tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// dense_shape: int64; Tensor containing the dense shape of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// indices: Tensor containing the indices of the sparse tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// dense_shape: Tensor containing the dense shape of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. 
-func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. +func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_output bool, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } @@ -18975,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 1b215ce9f3236d2de1c679530332ffe773ac4168 Mon Sep 17 00:00:00 2001 From: Pete Warden Date: Wed, 13 May 2020 22:00:02 -0700 Subject: [PATCH 170/412] Update speech training notebook to use quantized inputs and outputs PiperOrigin-RevId: 311467379 Change-Id: Id8df2f2a5a72f4dd2f5b8c3178ab9980a22dfff9 --- .../examples/micro_speech/train/train_micro_speech_model.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb b/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb index 2a64ecd7078..bfe75bdd9f7 100644 --- a/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb +++ b/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb @@ -1 +1 @@ -{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"train_micro_speech_model.ipynb","provenance":[{"file_id":"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb","timestamp":1587690382292}],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"pO4-CY_TCZZS","colab_type":"text"},"source":["# Train a Simple Audio Recognition Model"]},{"cell_type":"markdown","metadata":{"id":"BaFfr7DHRmGF","colab_type":"text"},"source":["This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.\n","\n","The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview).\n","\n","\n"," \n"," \n","
\n"," Run in Google Colab\n"," \n"," View source on GitHub\n","
\n"]},{"cell_type":"markdown","metadata":{"id":"XaVtYN4nlCft","colab_type":"text"},"source":["**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n","\n","## Configure Defaults\n","\n","**MODIFY** the following constants for your specific use case."]},{"cell_type":"code","metadata":{"id":"ludfxbNIaegy","colab_type":"code","outputId":"1667d949-267c-4588-fe25-c0674d1dd074","executionInfo":{"status":"ok","timestamp":1588895159583,"user_tz":420,"elapsed":3711,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}},"colab":{"base_uri":"https://localhost:8080/","height":85}},"source":["# A comma-delimited list of the words you want to train for.\n","# The options are: yes,no,up,down,left,right,on,off,stop,go\n","# All the other words will be used to train an \"unknown\" label and silent\n","# audio data with no spoken words will be used to train a \"silence\" label.\n","WANTED_WORDS = \"yes,no\"\n","\n","# The number of steps and learning rates can be specified as comma-separated\n","# lists to define the rate at each stage. For example,\n","# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n","# will run 12,000 training loops in total, with a rate of 0.001 for the first\n","# 8,000, and 0.0001 for the final 3,000.\n","TRAINING_STEPS = \"12000,3000\"\n","LEARNING_RATE = \"0.001,0.0001\"\n","\n","# Calculate the total number of steps, which is used to identify the checkpoint\n","# file name.\n","TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n","\n","# Print the configuration to confirm it\n","!echo \"Training these words:\" $WANTED_WORDS\n","!echo \"Training steps in each stage:\" $TRAINING_STEPS\n","!echo \"Learning rate in each stage:\" $LEARNING_RATE\n","!echo \"Total number of training steps:\" $TOTAL_STEPS"],"execution_count":1,"outputs":[{"output_type":"stream","text":["Training these words: yes,no\n","Training steps in each stage: 12000,3000\n","Learning rate in each stage: 0.001,0.0001\n","Total number of training steps: 15000\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"gCgeOpvY9pAi","colab_type":"text"},"source":["**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference."]},{"cell_type":"code","metadata":{"id":"Nd1iM1o2ymvA","colab_type":"code","colab":{}},"source":["# Calculate the percentage of 'silence' and 'unknown' training samples required\n","# to ensure that we have equal number of samples for each label.\n","number_of_labels = WANTED_WORDS.count(',') + 1\n","number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n","equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n","SILENT_PERCENTAGE = equal_percentage_of_training_samples\n","UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n","\n","# Constants which are shared during training and inference\n","PREPROCESS = 'micro'\n","WINDOW_STRIDE =20\n","MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n"," # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n","\n","# Constants used during training only\n","VERBOSITY = 'WARN'\n","EVAL_STEP_INTERVAL = 
'1000'\n","SAVE_STEP_INTERVAL = '1000'\n","\n","# Constants for training directories and filepaths\n","DATASET_DIR = 'dataset/'\n","LOGS_DIR = 'logs/'\n","TRAIN_DIR = 'train/' # for training checkpoints and other files.\n","\n","# Constants for inference directories and filepaths\n","import os\n","MODELS_DIR = 'models'\n","if not os.path.exists(MODELS_DIR):\n"," os.mkdir(MODELS_DIR)\n","MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')\n","MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')\n","FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 'float_model.tflite')\n","MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')\n","SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')\n","\n","QUANT_INPUT_MIN = 0.0\n","QUANT_INPUT_MAX = 9.8077\n","QUANT_INPUT_RANGE = QUANT_INPUT_MAX - QUANT_INPUT_MIN"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6rLYpvtg9P4o","colab_type":"text"},"source":["## Setup Environment\n","\n","Install Dependencies"]},{"cell_type":"code","metadata":{"id":"ed_XpUrU5DvY","colab_type":"code","colab":{}},"source":["%tensorflow_version 1.x\n","import tensorflow as tf"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T9Ty5mR58E4i","colab_type":"text"},"source":["**DELETE** any old data from previous runs\n"]},{"cell_type":"code","metadata":{"id":"APGx0fEh7hFF","colab_type":"code","colab":{}},"source":["!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"GfEUlfFBizio","colab_type":"text"},"source":["Clone the TensorFlow GitHub repository, which contains the relevant code required to run this tutorial."]},{"cell_type":"code","metadata":{"id":"yZArmzT85SLq","colab_type":"code","colab":{}},"source":["!git clone -q --depth 1 https://github.com/tensorflow/tensorflow"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"nS9swHLSi7Bi","colab_type":"text"},"source":["Load TensorBoard to visualize the accuracy and loss as training proceeds.\n"]},{"cell_type":"code","metadata":{"id":"q4qF1VxP3UE4","colab_type":"code","colab":{}},"source":["%load_ext tensorboard\n","%tensorboard --logdir {LOGS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x1J96Ron-O4R","colab_type":"text"},"source":["## Training\n","\n","The following script downloads the dataset and begins training."]},{"cell_type":"code","metadata":{"id":"VJsEZx6lynbY","colab_type":"code","colab":{}},"source":["!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n","--data_dir={DATASET_DIR} \\\n","--wanted_words={WANTED_WORDS} \\\n","--silence_percentage={SILENT_PERCENTAGE} \\\n","--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n","--preprocess={PREPROCESS} \\\n","--window_stride={WINDOW_STRIDE} \\\n","--model_architecture={MODEL_ARCHITECTURE} \\\n","--how_many_training_steps={TRAINING_STEPS} \\\n","--learning_rate={LEARNING_RATE} \\\n","--train_dir={TRAIN_DIR} \\\n","--summaries_dir={LOGS_DIR} \\\n","--verbosity={VERBOSITY} \\\n","--eval_step_interval={EVAL_STEP_INTERVAL} \\\n","--save_step_interval={SAVE_STEP_INTERVAL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XQUJLrdS-ftl","colab_type":"text"},"source":["## Generate a TensorFlow Model for Inference\n","\n","Combine relevant training results (graph, weights, etc.) into a single file for inference. 
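The freeze step in the next cell reads the last checkpoint that training wrote, whose name combines the model architecture with the total step count. As a quick check that training actually finished, here is a minimal sketch (not a cell from the original notebook; it only assumes the TRAIN_DIR, MODEL_ARCHITECTURE, and TOTAL_STEPS constants defined above) that lists the matching checkpoint files:

```python
import glob

# Hypothetical helper: the freeze step reads a checkpoint whose prefix is
# <TRAIN_DIR><MODEL_ARCHITECTURE>.ckpt-<TOTAL_STEPS>, e.g. train/tiny_conv.ckpt-15000.
checkpoint_prefix = TRAIN_DIR + MODEL_ARCHITECTURE + '.ckpt-' + TOTAL_STEPS
matches = sorted(glob.glob(checkpoint_prefix + '*'))
if matches:
    print('Found final checkpoint files:')
    for path in matches:
        print(' ', path)
else:
    print('No checkpoint found for prefix', checkpoint_prefix)
```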
This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process."]},{"cell_type":"code","metadata":{"id":"xyc3_eLh9sAg","colab_type":"code","colab":{}},"source":["!rm -rf {SAVED_MODEL}\n","!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n","--wanted_words=$WANTED_WORDS \\\n","--window_stride_ms=$WINDOW_STRIDE \\\n","--preprocess=$PREPROCESS \\\n","--model_architecture=$MODEL_ARCHITECTURE \\\n","--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'$TOTAL_STEPS \\\n","--save_format=saved_model \\\n","--output_file={SAVED_MODEL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_DBGDxVI-nKG","colab_type":"text"},"source":["## Generate a TensorFlow Lite Model\n","\n","Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n","\n","The following cell will also print the model size, which will be under 20 kilobytes."]},{"cell_type":"code","metadata":{"id":"RIitkqvGWmre","colab_type":"code","colab":{}},"source":["import sys\n","# We add this path so we can import the speech processing modules.\n","sys.path.append(\"/content/tensorflow/tensorflow/examples/speech_commands/\")\n","import input_data\n","import models\n","import numpy as np"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"kzqECqMxgBh4","colab_type":"code","colab":{}},"source":["SAMPLE_RATE = 16000\n","CLIP_DURATION_MS = 1000\n","WINDOW_SIZE_MS = 30.0\n","FEATURE_BIN_COUNT = 40\n","BACKGROUND_FREQUENCY = 0.8\n","BACKGROUND_VOLUME_RANGE = 0.1\n","TIME_SHIFT_MS = 100.0\n","\n","DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'\n","VALIDATION_PERCENTAGE = 10\n","TESTING_PERCENTAGE = 10"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"rNQdAplJV1fz","colab_type":"code","colab":{}},"source":["model_settings = models.prepare_model_settings(\n"," len(input_data.prepare_words_list(WANTED_WORDS.split(','))),\n"," SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,\n"," WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)\n","audio_processor = input_data.AudioProcessor(\n"," DATA_URL, DATASET_DIR,\n"," SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,\n"," WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,\n"," TESTING_PERCENTAGE, model_settings, LOGS_DIR)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBj_AyCh1cC0","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," float_converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," float_tflite_model = float_converter.convert()\n"," float_tflite_model_size = open(FLOAT_MODEL_TFLITE, \"wb\").write(float_tflite_model)\n"," print(\"Float model is %d bytes\" % float_tflite_model_size)\n","\n"," converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," converter.optimizations = [tf.lite.Optimize.DEFAULT]\n"," converter.quantized_input_stats = {\"Reshape_1\": (QUANT_INPUT_MIN, QUANT_INPUT_MAX)}\n"," def representative_dataset_gen():\n"," for i in range(100):\n"," data, _ = audio_processor.get_data(1, i*1, model_settings,\n"," BACKGROUND_FREQUENCY, \n"," BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS,\n"," 'testing',\n"," sess)\n"," flattened_data = np.array(data.flatten(), dtype=np.float32).reshape(1, 1960)\n"," yield [flattened_data]\n"," converter.representative_dataset = representative_dataset_gen\n"," tflite_model = converter.convert()\n"," tflite_model_size = 
open(MODEL_TFLITE, \"wb\").write(tflite_model)\n"," print(\"Quantized model is %d bytes\" % tflite_model_size)\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"EeLiDZTbLkzv","colab_type":"text"},"source":["# Testing the TensorFlow Lite model's accuracy\n","\n","Verify that the model we've exported is still accurate, using the TF Lite Python API and our test set."]},{"cell_type":"code","metadata":{"id":"wQsEteKRLryJ","colab_type":"code","outputId":"d4a7c3eb-3d74-40e6-9eb5-7d2ffc5e3b6d","executionInfo":{"status":"ok","timestamp":1588901109389,"user_tz":420,"elapsed":9673,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}},"colab":{"base_uri":"https://localhost:8080/","height":51}},"source":["with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","float_interpreter = tf.lite.Interpreter(FLOAT_MODEL_TFLITE)\n","float_interpreter.allocate_tensors()\n","\n","float_input_index = float_interpreter.get_input_details()[0][\"index\"]\n","\n","float_output_index = float_interpreter.get_output_details()[0][\"index\"]\n","float_model_output = float_interpreter.tensor(float_output_index)\n","\n","float_correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," float_interpreter.set_tensor(float_input_index, flattened_input)\n"," float_interpreter.invoke()\n"," top_prediction = float_model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," float_correct_predictions += 1\n","\n","print('Float accuracy is %f%% (N=%d)' % ((float_correct_predictions * 100) / len(test_data), len(test_data)))\n","\n","interpreter = tf.lite.Interpreter(MODEL_TFLITE)\n","interpreter.allocate_tensors()\n","\n","input_index = interpreter.get_input_details()[0][\"index\"]\n","\n","output_index = interpreter.get_output_details()[0][\"index\"]\n","model_output = interpreter.tensor(output_index)\n","\n","with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," interpreter.set_tensor(input_index, flattened_input)\n"," interpreter.invoke()\n"," top_prediction = model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," correct_predictions += 1\n","\n","print('Quantized accuracy is %f%% (N=%d)' % ((correct_predictions * 100) / len(test_data), len(test_data)))\n"],"execution_count":15,"outputs":[{"output_type":"stream","text":["Float accuracy is 91.343042% (N=1236)\n","Quantized accuracy is 90.857605% (N=1236)\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"dt6Zqbxu-wIi","colab_type":"text"},"source":["## Generate a TensorFlow Lite for MicroControllers Model\n","Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for 
Microcontrollers."]},{"cell_type":"code","metadata":{"id":"XohZOTjR8ZyE","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":442},"outputId":"415d733c-86c4-4f19-9aa0-edc4112e6efb","executionInfo":{"status":"ok","timestamp":1588901187730,"user_tz":420,"elapsed":11964,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}}},"source":["# Install xxd if it is not available\n","!apt-get update && apt-get -qq install xxd\n","# Convert to a C source file\n","!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n","# Update variable names\n","REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n","!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}"],"execution_count":16,"outputs":[{"output_type":"stream","text":["Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]\n","Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 InRelease\n","Get:3 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran35/ InRelease [3,626 B]\n","Hit:4 http://archive.ubuntu.com/ubuntu bionic InRelease\n","Hit:5 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease\n","Ign:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 InRelease\n","Hit:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64 Release\n","Hit:8 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64 Release\n","Get:9 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]\n","Get:10 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic InRelease [15.4 kB]\n","Get:11 http://security.ubuntu.com/ubuntu bionic-security/main amd64 Packages [908 kB]\n","Get:12 http://security.ubuntu.com/ubuntu bionic-security/universe amd64 Packages [844 kB]\n","Get:13 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [74.6 kB]\n","Get:16 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main Sources [1,814 kB]\n","Get:17 http://archive.ubuntu.com/ubuntu bionic-updates/universe amd64 Packages [1,376 kB]\n","Get:18 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 Packages [1,205 kB]\n","Get:19 http://ppa.launchpad.net/marutter/c2d4u3.5/ubuntu bionic/main amd64 Packages [875 kB]\n","Fetched 7,294 kB in 3s (2,429 kB/s)\n","Reading package lists... Done\n","Selecting previously unselected package xxd.\n","(Reading database ... 144429 files and directories currently installed.)\n","Preparing to unpack .../xxd_2%3a8.0.1453-1ubuntu1.3_amd64.deb ...\n","Unpacking xxd (2:8.0.1453-1ubuntu1.3) ...\n","Setting up xxd (2:8.0.1453-1ubuntu1.3) ...\n","Processing triggers for man-db (2.8.3-2ubuntu0.1) ...\n"],"name":"stdout"}]},{"cell_type":"markdown","metadata":{"id":"2pQnN0i_-0L2","colab_type":"text"},"source":["## Deploy to a Microcontroller\n","\n","Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n","\n","**Reference Model:** If you have not modified this notebook, you can follow the instructions as is, to deploy the model. 
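Where xxd is not available, the same kind of C source file can be produced directly in Python. The following is only a sketch, not a cell from the original notebook; it assumes the MODEL_TFLITE and MODEL_TFLITE_MICRO paths defined earlier and writes the array under the same g_model name that the sed command above produces:

```python
# Illustrative alternative to the xxd/sed cell above: dump the .tflite
# flatbuffer as a C byte array plus its length.
with open(MODEL_TFLITE, 'rb') as f:
    model_bytes = f.read()

rows = []
for offset in range(0, len(model_bytes), 12):
    chunk = model_bytes[offset:offset + 12]
    rows.append('  ' + ', '.join('0x%02x' % b for b in chunk) + ',')

with open(MODEL_TFLITE_MICRO, 'w') as f:
    f.write('unsigned char g_model[] = {\n')
    f.write('\n'.join(rows) + '\n')
    f.write('};\n')
    f.write('unsigned int g_model_len = %d;\n' % len(model_bytes))
```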
Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook. \n","\n","**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell."]},{"cell_type":"code","metadata":{"id":"eoYyh0VU8pca","colab_type":"code","outputId":"dbaba37d-8a8d-4e11-d780-478971d9ee95","colab":{"base_uri":"https://localhost:8080/","height":1000},"executionInfo":{"status":"ok","timestamp":1588901241295,"user_tz":420,"elapsed":1288,"user":{"displayName":"Pete Warden","photoUrl":"https://lh3.googleusercontent.com/a-/AOh14Gg9RGhKK9hlUJPY0U8OJIEUEeTc3V08ZIBIs175=s64","userId":"17073007660171926128"}}},"source":["# Print the C source file\n","!cat {MODEL_TFLITE_MICRO}"],"execution_count":17,"outputs":[{"output_type":"stream","text":["unsigned char g_model[] = {\n"," 0x1c, 0x00, 0x00, 0x00, 0x54, 0x46, 0x4c, 0x33, 0x00, 0x00, 0x12, 0x00,\n"," 0x1c, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00,\n"," 0x00, 0x00, 0x18, 0x00, 0x12, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n"," 0x64, 0x49, 0x00, 0x00, 0x34, 0x42, 0x00, 0x00, 0x1c, 0x42, 0x00, 0x00,\n"," 0x3c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x04, 0x00, 0x08, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,\n"," 0x13, 0x00, 0x00, 0x00, 0x6d, 0x69, 0x6e, 0x5f, 0x72, 0x75, 0x6e, 0x74,\n"," 0x69, 0x6d, 0x65, 0x5f, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6f, 0x6e, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0xd4, 0x41, 0x00, 0x00, 0xb4, 0x41, 0x00, 0x00,\n"," 0x24, 0x03, 0x00, 0x00, 0xf4, 0x02, 0x00, 0x00, 0xec, 0x02, 0x00, 0x00,\n"," 0xe4, 0x02, 0x00, 0x00, 0xc4, 0x02, 0x00, 0x00, 0xbc, 0x02, 0x00, 0x00,\n"," 0x2c, 0x00, 0x00, 0x00, 0x24, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0xee, 0xbc, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,\n"," 0x05, 0x00, 0x00, 0x00, 0x31, 0x2e, 0x35, 0x2e, 0x30, 0x00, 0x00, 0x00,\n"," 0xd0, 0xb9, 0xff, 0xff, 0xd4, 0xb9, 0xff, 0xff, 0x0a, 0xbd, 0xff, 0xff,\n"," 0x04, 0x00, 0x00, 0x00, 0x80, 0x02, 0x00, 0x00, 0xd4, 0x3e, 0x2e, 0xa6,\n"," 0xd9, 0x4c, 0x23, 0x25, 0xd3, 0x2f, 0x09, 0xcb, 0xf6, 0x04, 0xc4, 0x1d,\n"," 0xe5, 0x46, 0xf2, 0xcf, 0xd5, 0x53, 0x0c, 0x2b, 0x28, 0x06, 0xf8, 0xe9,\n"," 0xe1, 0xdb, 0xdd, 0xf0, 0xbe, 0x0c, 0xfc, 0xa5, 0xb9, 0x1b, 0xca, 0x13,\n"," 0x0d, 0xed, 0x0b, 0xd3, 0xff, 0xc8, 0x0d, 0xee, 0x04, 0xfe, 0xe1, 0x08,\n"," 0xd9, 0xec, 0x26, 0x06, 0x0c, 0xcb, 0x1b, 0xc3, 0xf8, 0x81, 0xd5, 0xbc,\n"," 0xc8, 0x48, 0xe6, 0x46, 0x0e, 0x34, 0x09, 0x0c, 0xea, 0x23, 0xe0, 0x14,\n"," 0x17, 0xf5, 0xe0, 0x07, 0xe2, 0x3a, 0xaa, 0xea, 0x05, 0x5f, 0x26, 0x31,\n"," 0x4e, 0xf6, 0xce, 0xe6, 0x0b, 0xed, 0xa7, 0xea, 0xbe, 0x08, 0xa4, 0x1b,\n"," 0xd0, 0x50, 0x11, 0x2a, 0x16, 0xd3, 0xca, 0x11, 0xeb, 0xd8, 0xcb, 0xeb,\n"," 0xfc, 0xee, 0xa5, 0x12, 0xda, 0x19, 0xfd, 0x1e, 0x1e, 0xc1, 0xc8, 0xe7,\n"," 0xfc, 0x99, 0xae, 0xca, 0xe9, 0x57, 
0x19, 0xe8, 0x1e, 0xff, 0xc4, 0xef,\n"," 0xdc, 0x0d, 0x25, 0xef, 0x1c, 0xef, 0x2e, 0xed, 0xf3, 0x39, 0xd6, 0x76,\n"," 0xe5, 0x4b, 0xb2, 0x2d, 0x4a, 0xf0, 0xf5, 0xcb, 0xc7, 0xf4, 0xbe, 0xea,\n"," 0xcb, 0xed, 0xce, 0x0a, 0xa4, 0x69, 0x1a, 0x34, 0x0a, 0xdc, 0xca, 0x37,\n"," 0xd4, 0xdf, 0x34, 0xe6, 0xf1, 0xd2, 0xb9, 0x1d, 0xb1, 0x42, 0xa3, 0x3a,\n"," 0x0f, 0xc0, 0xc3, 0x0a, 0xcf, 0xc4, 0xe7, 0xd2, 0xfa, 0x62, 0x14, 0x18,\n"," 0x49, 0xe1, 0x07, 0xe2, 0xec, 0x29, 0x4c, 0xd0, 0x53, 0xda, 0xdb, 0xe8,\n"," 0xf9, 0x2f, 0x0e, 0xf6, 0x17, 0x2a, 0x23, 0x29, 0x7d, 0xec, 0x04, 0x2b,\n"," 0x27, 0xf8, 0xb2, 0xdc, 0xbf, 0xec, 0xec, 0xb0, 0xe4, 0x62, 0x01, 0x42,\n"," 0x28, 0xe2, 0x13, 0xe7, 0x13, 0xf3, 0xd3, 0xe1, 0xf7, 0xc3, 0xee, 0xf9,\n"," 0xc4, 0x62, 0xfc, 0x58, 0x12, 0xc5, 0x02, 0x19, 0xe3, 0xe1, 0xf0, 0xe8,\n"," 0xc4, 0x5e, 0xf9, 0xf3, 0x31, 0xce, 0xf0, 0xc0, 0xf8, 0x2e, 0x34, 0x37,\n"," 0x7f, 0xc7, 0xa1, 0xdf, 0xf3, 0x31, 0xf8, 0xed, 0x27, 0x11, 0xc9, 0x19,\n"," 0x72, 0xf3, 0x18, 0x1b, 0x2b, 0xe6, 0xef, 0xd8, 0xd1, 0xd4, 0x14, 0xf8,\n"," 0xd5, 0x51, 0x40, 0x42, 0x2d, 0xe5, 0x0b, 0x94, 0x03, 0xf4, 0xde, 0xdf,\n"," 0xf1, 0xc0, 0x08, 0xf9, 0xc4, 0x71, 0xf5, 0x75, 0x20, 0xc8, 0xf9, 0xcb,\n"," 0xe0, 0x0c, 0x81, 0xf5, 0xc2, 0x6f, 0x25, 0xe3, 0x15, 0xca, 0x40, 0xac,\n"," 0xe6, 0x37, 0x60, 0xb4, 0x30, 0xb8, 0x19, 0xdb, 0xf1, 0x22, 0x56, 0xfe,\n"," 0x02, 0xf7, 0xfb, 0x0e, 0x68, 0xe6, 0x5e, 0x81, 0x15, 0xe4, 0xc5, 0xd9,\n"," 0xc3, 0xbd, 0x42, 0xe5, 0xbe, 0x2f, 0xde, 0x3d, 0x04, 0xe3, 0x4a, 0x97,\n"," 0xdb, 0xf6, 0xb1, 0xdf, 0xe5, 0xb2, 0x4b, 0xf2, 0xbc, 0x5e, 0x22, 0x7f,\n"," 0xfd, 0xd7, 0x37, 0xda, 0xd2, 0x1a, 0x22, 0xf8, 0xbf, 0x69, 0x1b, 0x22,\n"," 0x07, 0xcc, 0x11, 0xa3, 0xf8, 0x2c, 0x35, 0xdf, 0x60, 0xc8, 0xc9, 0xd9,\n"," 0xeb, 0x0c, 0x4e, 0x2e, 0x28, 0xe4, 0x44, 0x02, 0x7f, 0xda, 0x62, 0x25,\n"," 0x14, 0xe6, 0xbd, 0xe1, 0xcf, 0x9c, 0x50, 0x17, 0xff, 0x1e, 0xc3, 0x3c,\n"," 0x25, 0xde, 0x4c, 0x14, 0xf7, 0xfc, 0x02, 0xe1, 0xdd, 0xd3, 0x3d, 0xf8,\n"," 0xef, 0x49, 0x0c, 0x7b, 0x0a, 0xff, 0x24, 0x34, 0xfe, 0x2b, 0x14, 0x0b,\n"," 0xb6, 0x4f, 0xc5, 0x23, 0xe6, 0xe2, 0x12, 0x9f, 0xeb, 0x21, 0xc9, 0x45,\n"," 0x35, 0xcc, 0xbf, 0xea, 0x01, 0xf4, 0xe0, 0x15, 0x0e, 0xe8, 0x9d, 0xff,\n"," 0x54, 0xc7, 0xec, 0x27, 0x32, 0xed, 0xe3, 0xef, 0xd6, 0xa7, 0xf5, 0xea,\n"," 0xfa, 0x09, 0xc3, 0x32, 0x1d, 0xfd, 0x05, 0x19, 0x03, 0xf6, 0x05, 0xe9,\n"," 0xed, 0xe6, 0x05, 0x64, 0xf0, 0x35, 0xdc, 0x61, 0x12, 0x1d, 0x20, 0x3c,\n"," 0x0f, 0x33, 0xf8, 0x12, 0xa1, 0x1c, 0x81, 0x1d, 0xdc, 0xe1, 0x0a, 0x99,\n"," 0xd1, 0xf7, 0x9f, 0xc9, 0x1b, 0xd8, 0x32, 0xf2, 0xee, 0xb3, 0xaf, 0x0f,\n"," 0x01, 0xdd, 0x49, 0xf8, 0x7c, 0xa6, 0xbd, 0xac, 0x36, 0xeb, 0x0f, 0x01,\n"," 0xdb, 0xca, 0xb8, 0xb8, 0xf8, 0xf6, 0xf9, 0x27, 0x32, 0xf8, 0xde, 0xef,\n"," 0x19, 0xff, 0xf9, 0xf7, 0xf3, 0xde, 0xc7, 0x93, 0xfb, 0x1e, 0x1d, 0x50,\n"," 0xf3, 0x31, 0xc5, 0x00, 0x18, 0x27, 0xb8, 0x1a, 0x9e, 0xdf, 0xd0, 0x2c,\n"," 0xce, 0xe0, 0xa3, 0xa9, 0x9d, 0xb8, 0xaf, 0x67, 0x13, 0xd3, 0x19, 0xf7,\n"," 0xed, 0x81, 0xb1, 0x3d, 0xe9, 0xd5, 0x00, 0xf4, 0x45, 0x93, 0xcd, 0x62,\n"," 0x1e, 0xd6, 0x3a, 0x08, 0xd9, 0xb9, 0xd2, 0x1e, 0xeb, 0xe9, 0xbb, 0x1e,\n"," 0x1f, 0xf9, 0xe0, 0x20, 0xf6, 0xf2, 0x30, 0xf9, 0xfe, 0xfb, 0xe9, 0x66,\n"," 0xeb, 0xf5, 0x13, 0x40, 0xcf, 0x2d, 0xce, 0x0f, 0xe9, 0x06, 0x9a, 0x0c,\n"," 0x64, 0xbc, 0xff, 0xff, 0x9a, 0xbf, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0x31, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0xbc, 0xff, 0xff,\n"," 0x88, 0xbc, 0xff, 0xff, 0xbe, 0xbf, 0xff, 0xff, 
0x04, 0x00, 0x00, 0x00,\n"," 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe1, 0xfe, 0xff, 0xff,\n"," 0x78, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x43, 0xfd, 0xff, 0xff, 0xa9, 0xff, 0xff, 0xff, 0x97, 0xfc, 0xff, 0xff,\n"," 0xea, 0xbf, 0xff, 0xff, 0x04, 0x00, 0x00, 0x00, 0x80, 0x3e, 0x00, 0x00,\n"," 0xf5, 0xf9, 0xff, 0x08, 0xea, 0x05, 0x0f, 0x0c, 0xf3, 0x0e, 0xf6, 0x0f,\n"," 0xfa, 0x01, 0x11, 0xf1, 0xf6, 0xea, 0xfc, 0x0f, 0xfc, 0xf1, 0xdd, 0x0e,\n"," 0x1c, 0xef, 0xe6, 0xff, 0x05, 0xe8, 0x03, 0x11, 0xf6, 0xf1, 0x11, 0x0c,\n"," 0xd7, 0x08, 0xf5, 0x30, 0xd9, 0x10, 0x14, 0x11, 0x10, 0x17, 0xee, 0x23,\n"," 0x0c, 0xeb, 0x00, 0x06, 0xf6, 0xf7, 0x18, 0x0e, 0x18, 0x13, 0xfe, 0xfa,\n"," 0xf3, 0xdd, 0xfa, 0xfb, 0x01, 0xfd, 0xe5, 0xe4, 0x00, 0x0d, 0xfe, 0x09,\n"," 0xe9, 0x0a, 0x10, 0x1d, 0xf8, 0xf4, 0x0a, 0x1a, 0x10, 0x12, 0x18, 0xf1,\n"," 0xfc, 0x1d, 0x00, 0x25, 0xd8, 0x08, 0xf8, 0xff, 0x06, 0x19, 0xf5, 0x0f,\n"," 0x1c, 0x17, 0x0c, 0x16, 0xf3, 0x29, 0x20, 0x32, 0xfe, 0x19, 0xfb, 0x02,\n"," 0x04, 0x15, 0xf3, 0x2b, 0x06, 0x14, 0x0e, 0xde, 0x04, 0x0e, 0xfc, 0x2d,\n"," 0x1b, 0xdb, 0xec, 0xee, 0x00, 0xf6, 0x01, 0x33, 0x02, 0xe7, 0x06, 0xdd,\n"," 0xf9, 0x03, 0x13, 0x03, 0xf8, 0xec, 0x14, 0xe4, 0x0f, 0xfa, 0xd4, 0x22,\n"," 0x00, 0x11, 0x09, 0x02, 0x0e, 0xf4, 0x05, 0xfb, 0x04, 0x15, 0x04, 0x03,\n"," 0xff, 0x0f, 0x09, 0xf2, 0xeb, 0xfc, 0x06, 0x00, 0xe5, 0x0a, 0xf2, 0xfc,\n"," 0xfd, 0x12, 0xee, 0xe9, 0xf2, 0xfd, 0xf9, 0xf3, 0xce, 0x0f, 0xe9, 0xee,\n"," 0xff, 0x14, 0x15, 0x0b, 0xcb, 0x03, 0xf2, 0x1b, 0xdb, 0x09, 0x1d, 0x07,\n"," 0xd8, 0xde, 0xe6, 0x13, 0xd8, 0xf0, 0xe6, 0x00, 0xe7, 0xec, 0xd3, 0x00,\n"," 0xc5, 0x25, 0xdb, 0x0a, 0xde, 0x1f, 0xd9, 0x11, 0xc1, 0x06, 0x01, 0x2e,\n"," 0x09, 0x19, 0x09, 0x0f, 0xbe, 0x00, 0xf7, 0x08, 0x10, 0x12, 0xff, 0x10,\n"," 0xf4, 0x05, 0xdf, 0x16, 0xe7, 0xe6, 0xef, 0xf4, 0xdd, 0x18, 0x18, 0x16,\n"," 0xeb, 0x1a, 0xd7, 0xdb, 0xee, 0x15, 0xf1, 0x1e, 0xfc, 0x02, 0xfe, 0x0a,\n"," 0xed, 0x17, 0x1c, 0x39, 0x01, 0xde, 0x06, 0xf3, 0xdb, 0x27, 0xfc, 0x1e,\n"," 0xe4, 0x01, 0x03, 0x1d, 0xc5, 0x0d, 0xea, 0x0b, 0xfe, 0x05, 0xfc, 0x10,\n"," 0xc2, 0x06, 0x0a, 0x51, 0xf4, 0xd8, 0xe8, 0x03, 0xcd, 0x1a, 0xe7, 0x13,\n"," 0xfb, 0xfd, 0xe2, 0x2a, 0xf7, 0x0d, 0xea, 0x29, 0xfc, 0xea, 0x1c, 0x08,\n"," 0x0a, 0x13, 0xfc, 0xf8, 0x15, 0xf3, 0x06, 0xe9, 0x1d, 0x0c, 0x1c, 0x14,\n"," 0xdc, 0x17, 0x16, 0xff, 0x00, 0x06, 0x0c, 0xfe, 0x0c, 0x0a, 0xe6, 0x18,\n"," 0xef, 0xd6, 0x1d, 0xee, 0xd2, 0x1c, 0xfe, 0x0d, 0xec, 0xfc, 0xe8, 0x02,\n"," 0xf8, 0x13, 0xf9, 0x17, 0x08, 0xf8, 0xf9, 0x06, 0x04, 0x07, 0xcf, 0x07,\n"," 0xfb, 0xde, 0xf2, 0x0c, 0xe4, 0xf2, 0x1d, 0xdd, 0xd7, 0xfd, 0xec, 0xfd,\n"," 0xd8, 0xd9, 0x0a, 0xf5, 0xf4, 0x02, 0x1f, 0x0e, 0xf8, 0x1a, 0xe0, 0x06,\n"," 0x0a, 0x23, 0xf6, 0x1f, 0xea, 0x07, 0xde, 0x00, 0xf5, 0x10, 0xe7, 0x06,\n"," 0xf3, 0xe1, 0x0a, 0x2a, 0xf0, 0x00, 0x18, 0x09, 0xe8, 0xd6, 0xec, 0x00,\n"," 0xef, 0x1c, 0xf2, 0x07, 0xf1, 0xf5, 0x16, 0x13, 0xdf, 0x0f, 0xdd, 0x1b,\n"," 0x10, 0xdb, 0xfb, 0x07, 0xda, 0x17, 0xdf, 0x28, 0xf5, 0xe9, 0x07, 0x0b,\n"," 0x02, 0xf4, 0xf0, 0x0e, 0xda, 0x1e, 0x1d, 0xff, 0xde, 0x0e, 0x1e, 0x24,\n"," 0xf5, 0xfc, 0x08, 0x1f, 0xff, 0x12, 0x09, 0x18, 0x20, 0xd8, 0x08, 0xf0,\n"," 0xef, 0x07, 0x02, 0x19, 0xe8, 0xf3, 0x02, 0x03, 0xdf, 0x22, 0x0e, 0x04,\n"," 0x0d, 0xf9, 0xea, 0x1c, 0xf1, 0x17, 0x08, 0x02, 0x0b, 0x02, 0x00, 0x22,\n"," 0xf0, 0x0e, 0xdf, 0x07, 0xea, 0x01, 0xf3, 0xef, 0xfb, 0xff, 0x07, 0xfd,\n"," 0xf7, 0xf2, 0x14, 0x1e, 0x17, 0xe7, 0x12, 0xf8, 0xee, 0xfc, 0x09, 0xe0,\n"," 0x08, 0xd5, 0x07, 0xff, 0x11, 0xf7, 0xee, 0x14, 0xfd, 0xe0, 
0xda, 0x03,\n"," 0xd5, 0xcd, 0x04, 0xe5, 0xea, 0xde, 0xf7, 0x02, 0x0b, 0xfb, 0x03, 0x10,\n"," 0xf7, 0xcf, 0x0c, 0xfb, 0xee, 0x06, 0x0a, 0x12, 0x0e, 0xd7, 0xfb, 0x06,\n"," 0xf6, 0xe0, 0xfb, 0xf1, 0xec, 0xf6, 0x13, 0xf6, 0x0a, 0xea, 0x24, 0x0a,\n"," 0xfd, 0xe6, 0xf8, 0x19, 0x06, 0xe2, 0x05, 0x20, 0x08, 0xe3, 0xd8, 0x05,\n"," 0x00, 0xcd, 0xeb, 0x0f, 0xfd, 0xec, 0xf6, 0xfc, 0xe1, 0xf8, 0xf4, 0xfe,\n"," 0xdf, 0x10, 0xf8, 0x0d, 0xf3, 0xf9, 0x06, 0x06, 0xd5, 0xfb, 0x16, 0x18,\n"," 0x00, 0xfe, 0xf9, 0x17, 0x12, 0xe2, 0xfb, 0xf8, 0xe5, 0x06, 0x29, 0xdf,\n"," 0xfb, 0xfd, 0x08, 0x11, 0xf8, 0x10, 0x13, 0x03, 0xe1, 0xf9, 0xf8, 0xfd,\n"," 0x06, 0xf2, 0x11, 0xff, 0xf8, 0xfe, 0x12, 0xf5, 0xf2, 0xe1, 0x26, 0x0b,\n"," 0xe9, 0xfe, 0x04, 0xf1, 0xeb, 0xfd, 0x0c, 0x26, 0xfd, 0xfb, 0x12, 0xf8,\n"," 0xfd, 0x01, 0x03, 0x05, 0x09, 0x27, 0x28, 0xff, 0x0f, 0x0a, 0xe9, 0xff,\n"," 0x00, 0xec, 0xf7, 0xf4, 0x04, 0x03, 0x08, 0x10, 0xfe, 0xf3, 0x1f, 0xf5,\n"," 0xf0, 0xff, 0x0a, 0x20, 0x0c, 0xd4, 0xef, 0xdb, 0xf5, 0xf4, 0x1a, 0x02,\n"," 0xfe, 0xda, 0x04, 0xe4, 0x0b, 0xd9, 0x1a, 0xee, 0xfd, 0xc6, 0xf8, 0x0d,\n"," 0xec, 0xfe, 0x19, 0xe1, 0x1f, 0xc5, 0x1d, 0x02, 0xf6, 0xd6, 0x04, 0xe6,\n"," 0x06, 0xe4, 0x0c, 0xf0, 0x31, 0xe8, 0xe2, 0xec, 0x1d, 0xe8, 0x0f, 0x02,\n"," 0x2d, 0xe8, 0xf1, 0xf7, 0x0f, 0xf9, 0x13, 0xfd, 0x1f, 0xd8, 0x24, 0x17,\n"," 0xfb, 0xf8, 0x01, 0xe3, 0x14, 0xaf, 0x14, 0x01, 0x1c, 0xe5, 0x10, 0xf2,\n"," 0x16, 0xd3, 0xed, 0xe3, 0x15, 0x02, 0x27, 0xeb, 0x1e, 0x12, 0x19, 0xff,\n"," 0x16, 0xeb, 0x13, 0x11, 0xfa, 0x14, 0xf4, 0x02, 0x11, 0x08, 0xfc, 0xf9,\n"," 0x07, 0xdc, 0x1c, 0xeb, 0x16, 0xf0, 0x1c, 0x06, 0x08, 0xfa, 0xf9, 0x11,\n"," 0xee, 0x07, 0xf3, 0x06, 0xfd, 0xfd, 0x19, 0xf9, 0xf1, 0xe2, 0x1f, 0xf2,\n"," 0x0f, 0xe9, 0x0c, 0xfb, 0x1d, 0x03, 0x02, 0xe2, 0x1c, 0x11, 0xfb, 0xf7,\n"," 0x04, 0x04, 0x18, 0xe7, 0x27, 0xe2, 0xfc, 0xf5, 0x06, 0x00, 0x08, 0xfd,\n"," 0x15, 0xdb, 0x16, 0xfe, 0x04, 0x08, 0xf8, 0xff, 0xfb, 0xeb, 0xeb, 0xfe,\n"," 0xed, 0xf4, 0xf0, 0xe4, 0xfe, 0x22, 0x09, 0x02, 0x21, 0xc8, 0x0b, 0xe4,\n"," 0xf4, 0xf2, 0x04, 0x02, 0xef, 0xce, 0x13, 0x07, 0xfa, 0xe0, 0xff, 0xf1,\n"," 0xfe, 0xd5, 0xfc, 0xdc, 0x0f, 0xf2, 0x05, 0x10, 0x00, 0xd4, 0x24, 0xea,\n"," 0x1e, 0xe3, 0x2a, 0x18, 0xf3, 0xd2, 0x01, 0xe0, 0x0e, 0xdb, 0x2a, 0xeb,\n"," 0x02, 0xdd, 0xec, 0xd7, 0x12, 0xec, 0x31, 0xfc, 0x25, 0xd9, 0x04, 0x08,\n"," 0x15, 0xd0, 0xe8, 0x14, 0x18, 0xf9, 0xfa, 0xf6, 0x24, 0xea, 0x0a, 0x06,\n"," 0x02, 0xfb, 0x05, 0xea, 0x02, 0xf0, 0x04, 0xf1, 0x1f, 0x13, 0x04, 0x17,\n"," 0x14, 0xf0, 0x0d, 0x10, 0x03, 0x05, 0x26, 0xec, 0xfe, 0xe8, 0x19, 0xe9,\n"," 0x0a, 0xee, 0xe4, 0x04, 0x2a, 0xec, 0x1b, 0x06, 0x05, 0xff, 0xd7, 0xf5,\n"," 0x1c, 0x0c, 0x20, 0xfe, 0xe3, 0xe1, 0x11, 0xdc, 0x2b, 0x03, 0x04, 0x1d,\n"," 0x1a, 0xd4, 0x1d, 0xea, 0x06, 0x04, 0x04, 0x1a, 0x1e, 0xef, 0x00, 0xe0,\n"," 0x1e, 0xf8, 0x0c, 0xfe, 0x12, 0xd8, 0x0b, 0xe5, 0xf2, 0x03, 0x21, 0x06,\n"," 0x01, 0x22, 0xef, 0xf3, 0xfb, 0xfb, 0x25, 0x17, 0x08, 0xeb, 0xf3, 0xec,\n"," 0xf4, 0x06, 0x21, 0xec, 0xe3, 0xe3, 0xe4, 0xe5, 0xf9, 0xe8, 0x0d, 0xec,\n"," 0x1c, 0xc3, 0x0b, 0xdf, 0x12, 0x05, 0xe6, 0xdd, 0xde, 0xc5, 0xe6, 0xea,\n"," 0x1a, 0xf1, 0x0f, 0xe3, 0x11, 0xcf, 0xea, 0xe5, 0xfe, 0xf6, 0x02, 0x0b,\n"," 0x0e, 0xd5, 0x03, 0xd6, 0x11, 0x02, 0x2d, 0xfc, 0xed, 0xec, 0xee, 0xfa,\n"," 0xf8, 0xf2, 0x01, 0x0e, 0x19, 0xf1, 0x14, 0x03, 0x1a, 0xf3, 0x0c, 0xf9,\n"," 0xf5, 0xf4, 0xf2, 0xdf, 0xf0, 0xd6, 0x32, 0xf6, 0x18, 0x06, 0xf3, 0x01,\n"," 0x02, 0xe8, 0x09, 0x14, 0xff, 0x0f, 0x23, 0x26, 0x05, 0xf3, 0x08, 0xf3,\n"," 0x16, 0xfb, 0xed, 0x0d, 0x13, 0xe8, 0x25, 0xf1, 0xe9, 0xf2, 0xf5, 
0x0c,\n"," 0x19, 0xf0, 0x1f, 0xfa, 0x00, 0xe4, 0xfe, 0x22, 0xf2, 0xd5, 0x14, 0xe9,\n"," 0x06, 0xe9, 0xfe, 0x13, 0x07, 0x08, 0x00, 0xfd, 0x16, 0xdb, 0xe0, 0x12,\n"," 0x07, 0x14, 0x09, 0x1c, 0x17, 0x10, 0x20, 0xd3, 0xfd, 0xe9, 0x25, 0xfb,\n"," 0x19, 0xd8, 0x0b, 0xf9, 0xf3, 0xde, 0xfe, 0x21, 0x12, 0xec, 0xf4, 0xe4,\n"," 0xf7, 0xff, 0x21, 0xef, 0x26, 0x0f, 0xf9, 0xee, 0xe6, 0x03, 0x2f, 0xf7,\n"," 0x0e, 0x10, 0xfa, 0x08, 0x0b, 0xfa, 0xe9, 0xff, 0xf9, 0xdd, 0x01, 0xe3,\n"," 0xfb, 0x01, 0xfc, 0xf4, 0x1a, 0xb9, 0xf6, 0xd5, 0x1b, 0x01, 0xfd, 0xe2,\n"," 0x03, 0xd2, 0x11, 0xf5, 0x10, 0xd9, 0x07, 0x07, 0xe1, 0xc1, 0xff, 0xd4,\n"," 0x10, 0xef, 0x23, 0x10, 0x01, 0xba, 0x09, 0xd1, 0xfd, 0xe3, 0x0d, 0xe3,\n"," 0x00, 0xcf, 0x03, 0xcd, 0xfd, 0xf9, 0xfe, 0xe9, 0x07, 0xe4, 0x04, 0xfc,\n"," 0xf1, 0x00, 0x21, 0x01, 0xf6, 0x01, 0xda, 0x14, 0xe8, 0xd9, 0x14, 0x05,\n"," 0x08, 0x01, 0x26, 0xf8, 0xfb, 0xc1, 0x2c, 0x1a, 0x06, 0xed, 0xef, 0xf5,\n"," 0xf1, 0x00, 0x0e, 0x19, 0x1f, 0x08, 0xff, 0x0c, 0x04, 0xf6, 0x25, 0x17,\n"," 0x1a, 0x0b, 0xeb, 0xe6, 0x0f, 0x10, 0x13, 0x14, 0x12, 0xfa, 0x22, 0xee,\n"," 0xe6, 0x0b, 0x2d, 0xf9, 0x1e, 0xf0, 0x04, 0x09, 0x00, 0x0f, 0x2f, 0x05,\n"," 0xe8, 0xf9, 0x03, 0xd7, 0x02, 0xea, 0x1f, 0xfd, 0x22, 0xed, 0xf1, 0xed,\n"," 0xfe, 0xdc, 0x0d, 0x0e, 0x0c, 0xf0, 0x19, 0xf1, 0x09, 0xe0, 0x2c, 0xfb,\n"," 0x02, 0xdc, 0xf3, 0xd9, 0x32, 0xf7, 0x09, 0xe3, 0x09, 0x17, 0x03, 0xf3,\n"," 0x08, 0x01, 0x1b, 0xfa, 0x06, 0xfa, 0x1f, 0x15, 0x16, 0xe7, 0x16, 0xfe,\n"," 0xfe, 0xf4, 0xe0, 0xe2, 0x12, 0x21, 0xfa, 0x15, 0x00, 0xcb, 0x07, 0xb6,\n"," 0x1b, 0xf2, 0x34, 0xfa, 0xfd, 0xba, 0x19, 0xd4, 0x2c, 0xde, 0xf2, 0x1c,\n"," 0x0c, 0xc5, 0xef, 0xe4, 0x0a, 0xfb, 0x03, 0x03, 0xf2, 0xcd, 0x01, 0xe0,\n"," 0xf2, 0xf6, 0xf5, 0x0a, 0xf6, 0xc5, 0x0d, 0xe2, 0x09, 0xdc, 0x00, 0x05,\n"," 0x10, 0xe1, 0x14, 0xf7, 0x02, 0x08, 0x14, 0x12, 0xf5, 0xf8, 0x1c, 0xe9,\n"," 0xf5, 0xf1, 0x26, 0xd8, 0x16, 0x06, 0x00, 0xf8, 0xf4, 0xe0, 0x32, 0x03,\n"," 0x07, 0x15, 0xea, 0x10, 0xf2, 0xfa, 0x17, 0x1f, 0x07, 0x07, 0x17, 0x06,\n"," 0x06, 0xe7, 0x05, 0xfe, 0xe5, 0x1b, 0x16, 0xff, 0xf8, 0xfe, 0x2c, 0xf8,\n"," 0x00, 0x03, 0xf3, 0xf3, 0xf3, 0xf0, 0xfb, 0xdf, 0x02, 0xe5, 0x16, 0xed,\n"," 0xf9, 0x01, 0x23, 0x03, 0x16, 0xe6, 0xfe, 0xeb, 0x00, 0xf0, 0x27, 0x1b,\n"," 0xeb, 0xee, 0x03, 0xe9, 0x02, 0xd8, 0x2f, 0xe4, 0x0d, 0xde, 0x14, 0xe3,\n"," 0xfd, 0xf6, 0x13, 0x06, 0x10, 0xf4, 0xeb, 0xe5, 0x19, 0xf0, 0x17, 0xea,\n"," 0x15, 0x0d, 0xe4, 0x0b, 0x31, 0xf3, 0x13, 0x1b, 0xf9, 0xe0, 0x0b, 0xfc,\n"," 0x09, 0x03, 0x26, 0xe6, 0xeb, 0xd1, 0xd9, 0xc8, 0x00, 0xf7, 0x26, 0x0a,\n"," 0x08, 0xd4, 0xe3, 0xd6, 0x1b, 0x06, 0x1a, 0xed, 0xf4, 0xee, 0xfd, 0xe7,\n"," 0x14, 0xe1, 0x06, 0x11, 0xf9, 0xaa, 0xf6, 0xd7, 0x0c, 0xdf, 0x25, 0x17,\n"," 0x11, 0xd8, 0xfa, 0x08, 0x0e, 0xed, 0x29, 0x0c, 0xec, 0xeb, 0x0b, 0x02,\n"," 0xf3, 0xfb, 0x19, 0x1c, 0x13, 0x11, 0x10, 0xeb, 0x0d, 0xef, 0x11, 0xff,\n"," 0x14, 0xe4, 0xd9, 0x02, 0xed, 0xe6, 0x23, 0xdf, 0xfb, 0xf4, 0xef, 0xee,\n"," 0xf9, 0xf2, 0x24, 0x04, 0x03, 0x02, 0x0b, 0x0e, 0xed, 0x08, 0x19, 0xf9,\n"," 0xf2, 0x02, 0xf4, 0x02, 0xf0, 0x1b, 0x03, 0x08, 0xf7, 0xe7, 0xf9, 0xf3,\n"," 0xf7, 0x15, 0x11, 0x18, 0x18, 0x0e, 0x13, 0x13, 0x0d, 0x0e, 0x0e, 0x06,\n"," 0xfb, 0xe8, 0x13, 0x09, 0x07, 0xf2, 0x24, 0x0c, 0x22, 0xf8, 0x08, 0xef,\n"," 0xee, 0xec, 0x25, 0x09, 0x17, 0xde, 0xfb, 0xdd, 0x0d, 0xd0, 0x3c, 0x29,\n"," 0x13, 0xf5, 0xeb, 0xeb, 0xfc, 0xd2, 0x33, 0xf9, 0x05, 0xe0, 0x15, 0x04,\n"," 0x08, 0xfd, 0x14, 0x14, 0xfe, 0x0a, 0xee, 0xe7, 0x14, 0xfb, 0x15, 0xef,\n"," 0x07, 0xdf, 0x12, 0x14, 0x00, 0xf0, 0xff, 0x03, 0xf9, 0xe5, 0xf7, 0xcf,\n"," 
0x07, 0xeb, 0x0b, 0xd8, 0xf4, 0xce, 0xe1, 0xaf, 0x20, 0x0b, 0xfa, 0x09,\n"," 0xf6, 0xbf, 0x18, 0xe9, 0x06, 0xcc, 0x03, 0xf4, 0x0e, 0xb8, 0x08, 0xd0,\n"," 0x07, 0xe9, 0x10, 0x17, 0x0a, 0xcf, 0x21, 0xf7, 0x03, 0xf9, 0x26, 0xe0,\n"," 0x04, 0xe8, 0x0c, 0xff, 0x0b, 0xfe, 0x16, 0x16, 0xfe, 0xda, 0x17, 0x04,\n"," 0xfd, 0x0b, 0x15, 0x0d, 0xf8, 0x08, 0xf9, 0xf3, 0x00, 0xe8, 0x07, 0x0a,\n"," 0xf4, 0xf9, 0x0e, 0xdc, 0xfb, 0xe3, 0xfe, 0x09, 0xff, 0x07, 0xfa, 0xfd,\n"," 0xe6, 0x05, 0xf9, 0x0e, 0xf2, 0xef, 0xfe, 0xf6, 0x04, 0xee, 0x2d, 0x0e,\n"," 0x04, 0xe7, 0xec, 0xfb, 0xf1, 0x08, 0x17, 0x04, 0xf9, 0xf9, 0x15, 0xff,\n"," 0x00, 0xfc, 0x23, 0xf6, 0x00, 0x1a, 0xf4, 0x1c, 0x02, 0x04, 0x1e, 0x11,\n"," 0x00, 0xee, 0xf3, 0xe6, 0xed, 0xfa, 0x24, 0xe0, 0xfb, 0xe7, 0x10, 0xd7,\n"," 0xdc, 0xf5, 0x4c, 0xf3, 0x19, 0x01, 0xf9, 0xef, 0x00, 0xee, 0x13, 0xeb,\n"," 0xf9, 0xd7, 0x0b, 0xf1, 0xef, 0x05, 0x45, 0xf7, 0x01, 0x0b, 0xf3, 0xfa,\n"," 0x0d, 0x10, 0x18, 0x1c, 0xf5, 0xf5, 0x0a, 0xef, 0x0c, 0x19, 0x06, 0xf8,\n"," 0x06, 0xf1, 0x29, 0xd0, 0x0c, 0x07, 0x17, 0xf7, 0x18, 0xb0, 0x26, 0xcf,\n"," 0x16, 0x01, 0x03, 0xf4, 0xf0, 0xc8, 0x04, 0xe8, 0x1a, 0xf4, 0x0f, 0xeb,\n"," 0x0e, 0xb6, 0x00, 0xd3, 0x04, 0xf8, 0x26, 0xf8, 0x1a, 0xa8, 0xf9, 0xcb,\n"," 0x04, 0xeb, 0x22, 0x0a, 0x0d, 0xcd, 0xeb, 0xea, 0x03, 0xe2, 0x09, 0xed,\n"," 0x0b, 0xe3, 0x09, 0xf1, 0xf1, 0xec, 0x21, 0xee, 0x0e, 0xf4, 0x1c, 0x04,\n"," 0xee, 0xfb, 0x0d, 0x1a, 0xfc, 0xf4, 0xfe, 0xef, 0x06, 0xe0, 0x13, 0x0e,\n"," 0xfd, 0x05, 0x0b, 0x1d, 0xfd, 0xf6, 0x09, 0x1b, 0x04, 0x27, 0xf5, 0x0e,\n"," 0xf0, 0xed, 0x1e, 0xf7, 0xea, 0xfa, 0x1a, 0xf9, 0xe5, 0x07, 0x15, 0x0e,\n"," 0x00, 0xea, 0xfa, 0xe9, 0xf7, 0xec, 0x31, 0xec, 0x04, 0x09, 0x10, 0xec,\n"," 0xfd, 0xe4, 0x27, 0x00, 0x0c, 0xdc, 0xdc, 0xde, 0xed, 0xe9, 0x1f, 0xe4,\n"," 0xfa, 0x02, 0xd9, 0xfe, 0x06, 0xf1, 0x15, 0xee, 0xf1, 0xf3, 0x14, 0xe2,\n"," 0x00, 0xdb, 0x28, 0x17, 0x09, 0xdc, 0xfe, 0xea, 0xfc, 0x14, 0x20, 0x13,\n"," 0xf9, 0xed, 0xf1, 0xe8, 0xfd, 0x04, 0x3a, 0xfd, 0x00, 0x15, 0xf1, 0xee,\n"," 0x10, 0xe3, 0x0b, 0x20, 0x10, 0xeb, 0x10, 0xc3, 0x14, 0xf8, 0x03, 0x0b,\n"," 0x11, 0xc3, 0x27, 0xc5, 0x2d, 0xdb, 0x15, 0x0e, 0xf5, 0xce, 0xfa, 0xd8,\n"," 0x1c, 0xf0, 0x20, 0x04, 0xec, 0xc4, 0xf9, 0xda, 0x1c, 0xd9, 0x01, 0x05,\n"," 0x1f, 0xbb, 0xf8, 0xff, 0xef, 0x06, 0x10, 0xe3, 0x02, 0xe6, 0xdb, 0xee,\n"," 0x02, 0xfe, 0xfc, 0x15, 0xfe, 0xf0, 0xdb, 0xfb, 0xf5, 0xfc, 0x16, 0x02,\n"," 0xed, 0x01, 0x12, 0xe2, 0x06, 0xeb, 0x10, 0x16, 0x03, 0xed, 0x1a, 0x07,\n"," 0xf0, 0xe4, 0x29, 0xf5, 0xfa, 0xe1, 0x07, 0xe8, 0xf8, 0xfd, 0xf5, 0x03,\n"," 0xfc, 0x18, 0x03, 0xe2, 0x00, 0xf7, 0x13, 0xf9, 0xe4, 0x10, 0x25, 0xfc,\n"," 0x0e, 0x1f, 0x1c, 0x12, 0x1e, 0xfd, 0x01, 0xf9, 0xef, 0x1d, 0x17, 0x1b,\n"," 0x04, 0xfd, 0x25, 0x12, 0xf5, 0x20, 0x0a, 0x02, 0x03, 0xff, 0xe6, 0xe5,\n"," 0xf4, 0x05, 0x42, 0x1a, 0x0b, 0xdc, 0xfd, 0xed, 0xf3, 0xd0, 0x43, 0xf3,\n"," 0x10, 0x09, 0x0a, 0xed, 0xff, 0xe2, 0x1b, 0x1d, 0x08, 0xe4, 0xfe, 0xf7,\n"," 0xff, 0xf9, 0x2e, 0xfa, 0xf8, 0xe7, 0xe7, 0xeb, 0xfd, 0xfe, 0x30, 0x06,\n"," 0x00, 0x1d, 0x12, 0xf4, 0x0d, 0xf4, 0x1c, 0xed, 0x01, 0xd2, 0x17, 0xb3,\n"," 0x0c, 0x0c, 0xf4, 0x1e, 0x26, 0xd8, 0xf7, 0xbd, 0x24, 0xe7, 0x11, 0x12,\n"," 0xf9, 0xb9, 0xf6, 0xde, 0x3c, 0xf7, 0xfe, 0x0c, 0x16, 0xc5, 0x14, 0xcd,\n"," 0x24, 0x06, 0xfa, 0x21, 0x03, 0xcb, 0xf7, 0xf0, 0xfc, 0xff, 0xfe, 0xf8,\n"," 0x0a, 0xed, 0xdf, 0xe4, 0x0f, 0x19, 0x10, 0x0f, 0xf9, 0xf9, 0x11, 0xf3,\n"," 0xf1, 0xf1, 0x33, 0xdc, 0x02, 0xd6, 0xde, 0xe0, 0xf9, 0xec, 0xfe, 0x09,\n"," 0xfc, 0xd4, 0xeb, 0x0b, 0xec, 0xe3, 0x10, 0x0e, 0x0d, 0x13, 0x00, 0xe6,\n"," 0xf2, 0xf2, 
0x12, 0xec, 0x05, 0xf7, 0xff, 0x03, 0x02, 0x0f, 0x0c, 0x00,\n"," 0xf3, 0xfc, 0x02, 0xd9, 0xf0, 0x02, 0xef, 0xfa, 0x06, 0xda, 0x0a, 0xe4,\n"," 0xf6, 0x10, 0x14, 0x03, 0x12, 0xe6, 0x25, 0x09, 0x06, 0xf1, 0x26, 0x04,\n"," 0xfa, 0xe1, 0xdd, 0xfa, 0xef, 0x06, 0x11, 0xfd, 0xf9, 0xf8, 0xfd, 0xe8,\n"," 0xf8, 0x0b, 0x24, 0x22, 0xf9, 0xd1, 0x1a, 0xfe, 0xf0, 0xed, 0x3c, 0xfd,\n"," 0xf6, 0xfc, 0xe4, 0xf6, 0xf1, 0x05, 0x25, 0xf9, 0xee, 0x1b, 0x0d, 0xe2,\n"," 0xf8, 0xff, 0x2b, 0x16, 0xf6, 0xf4, 0x27, 0xe0, 0x02, 0x05, 0x0a, 0x11,\n"," 0x1d, 0xd1, 0xfb, 0xcb, 0x17, 0xf3, 0x23, 0xf9, 0x17, 0xb7, 0xec, 0x9f,\n"," 0x1d, 0xf2, 0x0f, 0x27, 0x10, 0xc5, 0xfa, 0xdf, 0x21, 0xe7, 0x0e, 0x01,\n"," 0x06, 0xb1, 0x02, 0xe2, 0x0e, 0xf8, 0x07, 0x04, 0x1a, 0xc1, 0x04, 0xed,\n"," 0xfe, 0xf6, 0x0c, 0x1c, 0x1d, 0xe1, 0xe5, 0xed, 0x03, 0xd7, 0xfb, 0x28,\n"," 0x00, 0xdf, 0xe9, 0xcd, 0xef, 0x04, 0x20, 0xe9, 0x10, 0xde, 0x00, 0xee,\n"," 0xf3, 0xd0, 0x02, 0x09, 0x0b, 0x0e, 0xee, 0xf8, 0xea, 0xf3, 0x31, 0x0d,\n"," 0xf7, 0x1e, 0x0f, 0xe9, 0xe9, 0xff, 0x16, 0xda, 0x12, 0xf3, 0xec, 0x1c,\n"," 0xfd, 0x04, 0x0a, 0x09, 0x01, 0xed, 0xf9, 0x0d, 0xf9, 0x12, 0xfc, 0x08,\n"," 0xfa, 0xd6, 0x12, 0x0b, 0x02, 0xff, 0xfe, 0x06, 0x0f, 0xe3, 0xf0, 0xdb,\n"," 0xf0, 0xf5, 0x0f, 0x17, 0x0d, 0xe4, 0x2a, 0xf4, 0x13, 0xe9, 0x3a, 0x0c,\n"," 0x04, 0x11, 0xee, 0xf0, 0xf1, 0xf5, 0x31, 0x04, 0xf2, 0x04, 0x14, 0x02,\n"," 0xfd, 0xe7, 0x2a, 0xf6, 0xff, 0x17, 0xed, 0xea, 0xe1, 0xf9, 0x27, 0x20,\n"," 0x0b, 0xe6, 0x1f, 0xfe, 0x00, 0xf9, 0x10, 0x05, 0x04, 0x0e, 0xf0, 0xf7,\n"," 0x18, 0x17, 0x13, 0xf0, 0x21, 0xcd, 0xf9, 0xcd, 0x13, 0xfb, 0x05, 0xe6,\n"," 0x1b, 0xba, 0xf5, 0xb2, 0x2b, 0xd4, 0x19, 0x18, 0xf4, 0xc8, 0xee, 0xce,\n"," 0x31, 0xf4, 0xec, 0x2d, 0xfa, 0xc0, 0xeb, 0xe9, 0x0e, 0xe0, 0x2f, 0xfe,\n"," 0x17, 0xd1, 0x09, 0xfc, 0xf6, 0xdc, 0xf1, 0x00, 0x11, 0xd2, 0xf4, 0xe4,\n"," 0xfc, 0x0f, 0x02, 0x27, 0x0e, 0xdd, 0x19, 0x08, 0x03, 0xf8, 0x1f, 0xeb,\n"," 0xfa, 0x0d, 0xf1, 0x11, 0x0c, 0xe4, 0x31, 0x07, 0x02, 0xe7, 0xec, 0xf0,\n"," 0xe7, 0x02, 0x1b, 0xf0, 0xf8, 0x22, 0xfa, 0xe2, 0xfd, 0xf2, 0x13, 0x17,\n"," 0x0d, 0xf3, 0xfc, 0x01, 0xe4, 0xe2, 0x01, 0x09, 0xf4, 0xf1, 0x0c, 0x0d,\n"," 0x00, 0xf9, 0xfa, 0x07, 0x0c, 0xf4, 0xf5, 0xe9, 0xfa, 0x2f, 0x3d, 0x11,\n"," 0xef, 0x0b, 0x12, 0x04, 0xed, 0xfb, 0x17, 0x0e, 0x0d, 0xfb, 0xfb, 0xe1,\n"," 0x0e, 0xf0, 0x22, 0x13, 0x07, 0xed, 0xee, 0xda, 0xf2, 0xe8, 0x48, 0x07,\n"," 0xfc, 0xd2, 0xe3, 0xf0, 0xfa, 0xf9, 0x10, 0x0c, 0xe7, 0xeb, 0x01, 0xd3,\n"," 0xfb, 0xff, 0x3b, 0xf9, 0xf8, 0xef, 0xe9, 0xea, 0xe3, 0x01, 0x03, 0x04,\n"," 0xfb, 0xf9, 0x1a, 0x1e, 0x18, 0xf4, 0x05, 0x22, 0x21, 0xc9, 0x0c, 0xbf,\n"," 0x27, 0xfb, 0x06, 0x1d, 0x17, 0xce, 0x0e, 0xb7, 0x3c, 0xfa, 0xea, 0x0f,\n"," 0x12, 0xa6, 0xff, 0xd6, 0x25, 0xd4, 0x1e, 0xe4, 0x12, 0xaf, 0xdd, 0xd6,\n"," 0x2c, 0xfc, 0x08, 0xf5, 0x0e, 0xbb, 0x0a, 0xe2, 0x06, 0xfc, 0x27, 0x2e,\n"," 0x0f, 0xc7, 0xf8, 0x00, 0x00, 0x04, 0x1c, 0x0b, 0x0e, 0x04, 0x17, 0x11,\n"," 0x06, 0x0c, 0x17, 0x13, 0xfb, 0xf3, 0xe0, 0xe7, 0x06, 0xdf, 0x0b, 0x11,\n"," 0x01, 0xfb, 0xef, 0x05, 0xf3, 0xc7, 0x01, 0xfc, 0xfc, 0x0b, 0x04, 0x00,\n"," 0x04, 0x13, 0x25, 0x2a, 0x05, 0xfb, 0x24, 0xf1, 0xe6, 0xfd, 0x19, 0x09,\n"," 0x01, 0xe0, 0xf2, 0xf5, 0x03, 0xfd, 0xfe, 0x06, 0x08, 0xe9, 0xde, 0x1a,\n"," 0xfd, 0x17, 0x1b, 0x11, 0x0c, 0xf7, 0x0c, 0xf6, 0xfb, 0xf6, 0x29, 0x1b,\n"," 0x1e, 0x00, 0xea, 0xe2, 0xfe, 0xeb, 0x1d, 0x22, 0xff, 0x15, 0xec, 0xcd,\n"," 0xef, 0xc4, 0x18, 0x15, 0xed, 0xed, 0x08, 0xeb, 0xf8, 0xe4, 0x35, 0x08,\n"," 0x0b, 0xe4, 0x13, 0xf4, 0xf6, 0xff, 0x12, 0xfc, 0xfc, 0x05, 0x0b, 0xf6,\n"," 0xeb, 0x07, 0x0d, 0x0f, 
0xf8, 0x21, 0xf0, 0xe1, 0x1e, 0xf2, 0xf1, 0xfe,\n"," 0x2b, 0xe6, 0x2a, 0xd2, 0x15, 0xf1, 0x02, 0xfc, 0x22, 0xce, 0xe2, 0xbc,\n"," 0x35, 0xf9, 0x1e, 0x1c, 0x17, 0xaf, 0xf7, 0xfa, 0x2a, 0xea, 0x13, 0xfe,\n"," 0x08, 0xbe, 0x1b, 0xcf, 0x19, 0x16, 0x00, 0x1b, 0x1c, 0xbe, 0xe9, 0xee,\n"," 0x05, 0xe6, 0xec, 0x03, 0x26, 0xd2, 0xec, 0x0c, 0xf7, 0xeb, 0xf8, 0xf8,\n"," 0x1f, 0xde, 0xf3, 0xdd, 0x0f, 0x01, 0x26, 0xf9, 0x00, 0xf0, 0xe9, 0xe0,\n"," 0x0f, 0xc3, 0x0b, 0xe9, 0x01, 0xee, 0x03, 0xd8, 0xf4, 0xee, 0x29, 0x14,\n"," 0xf2, 0xfe, 0xf1, 0x09, 0xfc, 0x09, 0x0e, 0xfe, 0x06, 0x04, 0xfb, 0x07,\n"," 0xf0, 0xfe, 0x24, 0xfa, 0xf7, 0xf9, 0x0b, 0xfa, 0xf1, 0xf3, 0x1c, 0xf9,\n"," 0x05, 0xdb, 0x09, 0xf9, 0x10, 0xf5, 0x17, 0x2d, 0x09, 0xf9, 0xf3, 0x06,\n"," 0xfd, 0xe4, 0x07, 0xf6, 0xff, 0xfb, 0xfe, 0xf7, 0xfb, 0xf2, 0x22, 0xfe,\n"," 0xfb, 0xfb, 0x12, 0xe4, 0xf0, 0xec, 0x2a, 0x1c, 0xf8, 0xfa, 0x01, 0xd9,\n"," 0xef, 0x00, 0x1d, 0x06, 0xf8, 0xff, 0x05, 0x0b, 0xf4, 0x00, 0x38, 0x16,\n"," 0xf3, 0xf5, 0x1e, 0x07, 0xde, 0x0b, 0x32, 0x25, 0xfe, 0x03, 0x0d, 0x0a,\n"," 0x1f, 0x05, 0x28, 0x01, 0x19, 0xd3, 0xff, 0xc2, 0x0a, 0x01, 0xf6, 0x1e,\n"," 0x24, 0xda, 0xf9, 0xb2, 0x4f, 0xef, 0xf9, 0x13, 0xf5, 0xd2, 0xd7, 0xe6,\n"," 0x37, 0xf4, 0x02, 0x09, 0x05, 0xa3, 0xf7, 0xd9, 0x14, 0xf2, 0x0b, 0x05,\n"," 0x36, 0xbd, 0x0c, 0x17, 0xfc, 0xfa, 0x22, 0x27, 0x1f, 0xc2, 0xf6, 0xf3,\n"," 0xff, 0xe6, 0x25, 0x17, 0x08, 0xd0, 0x04, 0x1a, 0xfb, 0xff, 0x08, 0x24,\n"," 0xf1, 0xf3, 0x15, 0xf4, 0xf6, 0xf2, 0x12, 0xe5, 0x01, 0xd8, 0xec, 0x17,\n"," 0x00, 0xd9, 0x08, 0x11, 0x04, 0x11, 0x02, 0xe9, 0xea, 0xe9, 0x20, 0xf4,\n"," 0x12, 0xe7, 0xe3, 0x00, 0xfe, 0x10, 0x1d, 0xeb, 0xfe, 0xe6, 0xd6, 0x05,\n"," 0xfa, 0xf3, 0x14, 0x19, 0x03, 0xdc, 0x0e, 0xe3, 0xf7, 0xfd, 0x31, 0xf3,\n"," 0x05, 0x11, 0xf5, 0xe3, 0x01, 0x05, 0x2c, 0x03, 0x15, 0xdf, 0x21, 0x0e,\n"," 0xe7, 0xfb, 0x09, 0x0c, 0xfb, 0xf9, 0x1b, 0xdc, 0xe3, 0xf3, 0x14, 0xdb,\n"," 0x02, 0xe8, 0x0a, 0xfd, 0xf7, 0xf9, 0x05, 0xdb, 0xfb, 0xe7, 0xf2, 0xfe,\n"," 0xf5, 0xe5, 0x10, 0xdd, 0x00, 0xf0, 0xe0, 0xf5, 0xf0, 0x04, 0x19, 0x24,\n"," 0xff, 0xe4, 0xf0, 0xf0, 0x23, 0x19, 0x17, 0xf6, 0x11, 0xdd, 0xdf, 0xde,\n"," 0x2a, 0xee, 0x0a, 0xfb, 0x2b, 0xc5, 0x05, 0xb4, 0x51, 0xf3, 0x09, 0x10,\n"," 0x0a, 0xb3, 0xfd, 0xe6, 0x48, 0xdf, 0x14, 0x0b, 0x1b, 0xcc, 0xd9, 0xfa,\n"," 0x15, 0xe5, 0xff, 0x24, 0x30, 0xbf, 0x05, 0x02, 0x09, 0x14, 0x25, 0x18,\n"," 0x2d, 0xc2, 0xfe, 0xf5, 0x0a, 0x17, 0xfd, 0x03, 0x15, 0xd3, 0x21, 0x11,\n"," 0x10, 0xe5, 0x02, 0xe3, 0xf7, 0x06, 0x15, 0xfa, 0xf5, 0xd3, 0x17, 0x02,\n"," 0xf9, 0x05, 0x16, 0xe0, 0x16, 0xd4, 0x0c, 0xe9, 0xf4, 0xfd, 0x28, 0x15,\n"," 0x04, 0xe2, 0x03, 0xfd, 0xf6, 0xf5, 0xfb, 0xf8, 0xf4, 0xf1, 0x10, 0xe6,\n"," 0x02, 0xfe, 0x03, 0xca, 0xe8, 0x05, 0x14, 0x02, 0xf9, 0xdc, 0xef, 0xf7,\n"," 0x09, 0x0f, 0x1e, 0x11, 0xfb, 0xfb, 0x13, 0x23, 0xf8, 0x06, 0x14, 0x12,\n"," 0x1b, 0x13, 0x2a, 0xf4, 0x04, 0xe5, 0x24, 0x1c, 0x03, 0xf8, 0x01, 0xd3,\n"," 0xe4, 0xd0, 0x3d, 0xe7, 0x0c, 0xde, 0xf1, 0xe3, 0xf1, 0xe8, 0x12, 0xf1,\n"," 0x10, 0xdb, 0xe5, 0xd3, 0xe5, 0xf7, 0x0f, 0xeb, 0xf9, 0xee, 0x18, 0xe5,\n"," 0xe9, 0x13, 0x18, 0x26, 0x14, 0x00, 0xfc, 0xf7, 0x2b, 0x0f, 0x05, 0xf5,\n"," 0x39, 0xd3, 0xf1, 0xd8, 0x29, 0xf4, 0x0f, 0x15, 0x14, 0xbc, 0x00, 0xc9,\n"," 0x3f, 0xe1, 0x05, 0x11, 0x23, 0xb4, 0xe3, 0xf6, 0x51, 0xde, 0x26, 0xf6,\n"," 0x27, 0xb3, 0xf7, 0xdd, 0x2d, 0xf1, 0x10, 0x09, 0x3d, 0xcd, 0xea, 0xf1,\n"," 0x0c, 0x0e, 0xfe, 0x21, 0x24, 0xd6, 0xf9, 0x08, 0xff, 0xee, 0x12, 0x08,\n"," 0xfd, 0xe8, 0x19, 0xeb, 0x0b, 0xeb, 0x0f, 0x23, 0x0e, 0xd1, 0xfe, 0xf1,\n"," 0xf3, 0xd7, 0xf7, 0x1f, 0xff, 0xe5, 
0xfe, 0x12, 0x05, 0xee, 0x13, 0x20,\n"," 0x22, 0xdd, 0x03, 0x19, 0x08, 0xee, 0xfd, 0x01, 0x12, 0x1a, 0xfc, 0x0c,\n"," 0xf5, 0xf4, 0xfd, 0xef, 0x05, 0xe8, 0x17, 0x08, 0xf2, 0xea, 0x08, 0x13,\n"," 0x03, 0xff, 0xf0, 0xe9, 0xfe, 0xff, 0x22, 0xfb, 0xff, 0xee, 0x0c, 0xfb,\n"," 0xff, 0x06, 0x27, 0x01, 0x08, 0xe3, 0x0c, 0xf1, 0x06, 0xe4, 0x19, 0x0d,\n"," 0x0e, 0xe1, 0xdc, 0xe8, 0xdb, 0xed, 0x2a, 0x0a, 0x06, 0xfd, 0x0e, 0xfb,\n"," 0xfb, 0x06, 0x25, 0x27, 0xfc, 0xf2, 0xf5, 0xf6, 0xef, 0xf7, 0x35, 0xf2,\n"," 0xe9, 0xea, 0x05, 0xf1, 0xdf, 0x06, 0x16, 0xf2, 0xfe, 0xde, 0xf0, 0x05,\n"," 0x2c, 0x25, 0x0a, 0x15, 0x0e, 0xc2, 0x03, 0xad, 0x3a, 0xee, 0x09, 0x27,\n"," 0x31, 0xb8, 0x20, 0xb5, 0x53, 0xd7, 0x09, 0xea, 0x0b, 0xc9, 0x04, 0xf9,\n"," 0x61, 0xda, 0xde, 0x19, 0x2d, 0xc3, 0xe7, 0xd4, 0x1b, 0xe7, 0xf9, 0x0f,\n"," 0x43, 0xc2, 0xff, 0xe6, 0x0c, 0xef, 0x13, 0xf3, 0x1b, 0xe0, 0x0b, 0x08,\n"," 0x05, 0x03, 0x09, 0x03, 0x23, 0xf4, 0xe8, 0xf5, 0x15, 0xfe, 0xee, 0xe8,\n"," 0x06, 0xe1, 0xe8, 0xf0, 0x20, 0xb3, 0xf4, 0x02, 0x06, 0xe4, 0xfa, 0x14,\n"," 0x02, 0xef, 0x13, 0x16, 0x08, 0x0f, 0x0e, 0x22, 0x0b, 0xed, 0xf3, 0x1b,\n"," 0x1d, 0x01, 0x22, 0xec, 0x01, 0xe0, 0xf5, 0x18, 0x0c, 0xd5, 0xff, 0x0e,\n"," 0x09, 0x06, 0x0b, 0xf1, 0x12, 0xe2, 0xe4, 0xd5, 0x07, 0xfb, 0xfc, 0xfe,\n"," 0xf7, 0xf7, 0x04, 0x02, 0xfe, 0xee, 0x05, 0x06, 0x04, 0xd9, 0x00, 0x06,\n"," 0xfb, 0x01, 0x28, 0x06, 0x09, 0xfe, 0x1c, 0xd7, 0xf9, 0xdc, 0x1a, 0xf3,\n"," 0xf6, 0xc9, 0xfd, 0xfe, 0x06, 0xdc, 0x09, 0xf6, 0xfe, 0xe7, 0x18, 0xf9,\n"," 0xf7, 0xe4, 0x24, 0xf5, 0xe9, 0x0a, 0x08, 0xf0, 0xf1, 0x08, 0x2c, 0xfd,\n"," 0xf9, 0xe4, 0xf9, 0x03, 0x38, 0x05, 0x0d, 0xf6, 0x1e, 0xda, 0xfc, 0xb9,\n"," 0x58, 0x01, 0xff, 0xf5, 0x33, 0xb4, 0xf7, 0xb7, 0x72, 0x12, 0x14, 0xf7,\n"," 0xff, 0xd5, 0x06, 0xda, 0x61, 0xd0, 0x06, 0x05, 0x1e, 0xca, 0x0a, 0xfa,\n"," 0x30, 0xcf, 0xfa, 0xf2, 0x31, 0xd2, 0x0d, 0xcd, 0x2f, 0xd8, 0x13, 0x13,\n"," 0x2c, 0xcc, 0x08, 0xd6, 0x23, 0xd9, 0x12, 0x11, 0x18, 0xfa, 0x0c, 0xe3,\n"," 0x18, 0xef, 0xef, 0x00, 0x26, 0xf0, 0xf3, 0xe7, 0x1e, 0xc9, 0x0e, 0x26,\n"," 0x04, 0xeb, 0xf0, 0x0a, 0x26, 0xc9, 0xf6, 0xfb, 0x0c, 0xf1, 0x11, 0x00,\n"," 0x18, 0xec, 0x10, 0x07, 0x0e, 0x06, 0xde, 0xed, 0x0b, 0xd8, 0x13, 0xfe,\n"," 0x05, 0xfc, 0x00, 0xd0, 0x13, 0x07, 0x1f, 0xf2, 0x11, 0x13, 0x0a, 0x1d,\n"," 0x10, 0xf8, 0xfd, 0x06, 0x02, 0x06, 0xf5, 0xdf, 0x10, 0xfa, 0x11, 0xe0,\n"," 0xf7, 0xf5, 0xf9, 0xe8, 0x0d, 0xda, 0x02, 0xf3, 0xf2, 0xef, 0x0c, 0xe9,\n"," 0xfc, 0xc3, 0x18, 0x12, 0xea, 0xfb, 0x08, 0x0f, 0xf7, 0xdf, 0x23, 0x08,\n"," 0x03, 0xeb, 0xe9, 0x1e, 0xf2, 0xe2, 0x13, 0xea, 0x01, 0xf2, 0xec, 0xe8,\n"," 0xed, 0x0d, 0x15, 0xfc, 0x0f, 0xfd, 0x03, 0xfd, 0x61, 0xee, 0x12, 0xe4,\n"," 0x01, 0xd0, 0x0d, 0xc4, 0x4a, 0x10, 0x07, 0x1d, 0x2e, 0xab, 0xe3, 0xa9,\n"," 0x7f, 0xf8, 0x1f, 0xe3, 0x00, 0xe5, 0xe6, 0xcd, 0x6c, 0xc4, 0x2a, 0xfb,\n"," 0x18, 0xd8, 0xf7, 0xb7, 0x49, 0xf7, 0x19, 0xe2, 0x2e, 0xe3, 0xf5, 0xfd,\n"," 0x33, 0xfa, 0x0b, 0xfd, 0x0a, 0xdc, 0xf0, 0x0c, 0x34, 0xd0, 0x02, 0xf4,\n"," 0x22, 0xe4, 0xf8, 0xe3, 0x2f, 0xe4, 0x11, 0xe5, 0x0e, 0x0c, 0x1e, 0xe6,\n"," 0x21, 0xe8, 0x10, 0xfa, 0x07, 0xfa, 0xef, 0x03, 0x01, 0xde, 0x02, 0x08,\n"," 0x0d, 0xdc, 0x17, 0x00, 0x01, 0xe1, 0x1c, 0x0e, 0xfc, 0x02, 0x04, 0xe8,\n"," 0x07, 0xee, 0x06, 0xff, 0x09, 0xcd, 0x1a, 0xd1, 0x18, 0x2c, 0xff, 0xf4,\n"," 0xf4, 0xee, 0x19, 0xec, 0x1b, 0xf4, 0x09, 0x0e, 0x02, 0xee, 0x15, 0xe3,\n"," 0x0f, 0xe4, 0x02, 0x08, 0xfb, 0x15, 0x09, 0xf1, 0x01, 0xcd, 0x22, 0x19,\n"," 0xee, 0x04, 0x1f, 0xd7, 0x0c, 0xd5, 0x10, 0xea, 0x0c, 0x06, 0x14, 0xd1,\n"," 0xef, 0xef, 0x22, 0x22, 0xf1, 0xf1, 0xfc, 0x0d, 
0xf7, 0x00, 0x0e, 0x07,\n"," 0xf4, 0x0d, 0x12, 0x01, 0xde, 0x1d, 0x04, 0xe5, 0x03, 0x15, 0xe8, 0xda,\n"," 0x62, 0x0f, 0x1a, 0xeb, 0x13, 0xd1, 0x09, 0xe7, 0x79, 0x25, 0xfb, 0xff,\n"," 0x43, 0xa8, 0xef, 0xa4, 0x61, 0xfe, 0x15, 0x16, 0x28, 0xbc, 0x07, 0xd6,\n"," 0x59, 0xd3, 0x00, 0xf0, 0x18, 0xcb, 0x05, 0xca, 0x2f, 0x08, 0xf4, 0x2d,\n"," 0x1f, 0xe5, 0x07, 0xfb, 0x1c, 0x0e, 0x26, 0xf3, 0x3c, 0xd1, 0xe7, 0xf7,\n"," 0x0f, 0xf2, 0xfc, 0x24, 0x3a, 0xf4, 0xfa, 0xfc, 0x09, 0xe1, 0x0e, 0x00,\n"," 0x06, 0xe2, 0x04, 0xe8, 0x15, 0xdd, 0xf6, 0x06, 0x21, 0xe5, 0xfb, 0xe7,\n"," 0xfe, 0xed, 0xfb, 0x14, 0x1c, 0xdd, 0xf8, 0xf6, 0x26, 0x02, 0x02, 0xf1,\n"," 0xf7, 0xd3, 0x13, 0xeb, 0x18, 0x03, 0x12, 0xf4, 0xe5, 0xf0, 0xef, 0xe9,\n"," 0x2c, 0x0d, 0xe3, 0x19, 0x12, 0xc8, 0xdd, 0xee, 0x08, 0x0b, 0xee, 0x19,\n"," 0xf9, 0xf3, 0xf4, 0xf9, 0x0a, 0xfd, 0xf2, 0x0e, 0x15, 0xf8, 0xd6, 0x03,\n"," 0x1f, 0xe9, 0xfd, 0x04, 0x15, 0x1f, 0x21, 0xe1, 0x0c, 0xf8, 0xec, 0xf4,\n"," 0xee, 0x0c, 0xef, 0xfd, 0x0a, 0xf4, 0x06, 0x14, 0x10, 0xe1, 0xdd, 0x0b,\n"," 0x0b, 0x05, 0x0e, 0x0f, 0x01, 0xf7, 0xfd, 0xe0, 0xe2, 0x26, 0x28, 0x26,\n"," 0x10, 0x00, 0xe8, 0xfd, 0xfa, 0xec, 0xf7, 0x14, 0x08, 0xff, 0xf7, 0x0c,\n"," 0x06, 0x09, 0xf3, 0x0b, 0xf3, 0xfe, 0xec, 0xfd, 0x1a, 0xf8, 0xf1, 0xdb,\n"," 0xfe, 0x0f, 0xff, 0x0b, 0x17, 0x1f, 0xfb, 0xe7, 0x0c, 0x13, 0x10, 0xf6,\n"," 0x04, 0x11, 0xf3, 0xfd, 0xec, 0xd0, 0xf3, 0xfa, 0x01, 0xfe, 0x03, 0x07,\n"," 0x0d, 0xde, 0xf8, 0x05, 0xee, 0xf0, 0xff, 0x08, 0xff, 0xf0, 0x1d, 0x05,\n"," 0x14, 0xea, 0xfe, 0x04, 0xf1, 0x0e, 0x19, 0xfb, 0x1a, 0xff, 0xef, 0xf2,\n"," 0x02, 0xf5, 0xe7, 0x0e, 0xe4, 0x1d, 0xfa, 0x14, 0xf0, 0xde, 0xf0, 0xe4,\n"," 0xf6, 0x04, 0x07, 0xe6, 0xf1, 0x1b, 0xff, 0xfb, 0x16, 0x02, 0x01, 0x10,\n"," 0x08, 0x14, 0x08, 0x03, 0xf7, 0x01, 0x02, 0xf6, 0xf9, 0xe7, 0xe9, 0xf5,\n"," 0x05, 0x14, 0xfc, 0xe1, 0xfb, 0x20, 0x03, 0x18, 0xfa, 0xe9, 0xf0, 0x1d,\n"," 0xf9, 0xf0, 0xfb, 0xed, 0x0a, 0xd9, 0xf4, 0xeb, 0xed, 0x05, 0xf7, 0x0b,\n"," 0x0f, 0xf0, 0x0a, 0x07, 0xee, 0xdd, 0x17, 0x08, 0xfb, 0x1c, 0xf4, 0x23,\n"," 0xfd, 0x0f, 0x07, 0xdf, 0x03, 0x1f, 0xed, 0xf1, 0xfd, 0xfb, 0xdc, 0x0a,\n"," 0x18, 0xf9, 0x00, 0xea, 0xf7, 0xe8, 0xf6, 0x07, 0xee, 0xf8, 0xec, 0xf7,\n"," 0x04, 0x0e, 0x0f, 0x00, 0x18, 0xfc, 0x09, 0x1a, 0xfb, 0x00, 0xe5, 0xff,\n"," 0x0f, 0x08, 0xeb, 0xfc, 0x0f, 0xe6, 0x14, 0x03, 0xf6, 0xfc, 0x0f, 0xfc,\n"," 0x0b, 0xf2, 0x1c, 0x06, 0xf9, 0x09, 0xf9, 0xdf, 0x14, 0xfb, 0xd6, 0xeb,\n"," 0xfb, 0xeb, 0x0d, 0x0b, 0x15, 0xe6, 0xf6, 0x04, 0x17, 0xfc, 0x10, 0xf4,\n"," 0x05, 0xf7, 0xf7, 0xf2, 0xf9, 0xf0, 0xfc, 0x10, 0x08, 0x0d, 0xe1, 0x0c,\n"," 0x06, 0x12, 0xf1, 0xfd, 0x10, 0x2a, 0xfb, 0xec, 0x0c, 0x05, 0x0b, 0x18,\n"," 0x2b, 0x0c, 0x08, 0xeb, 0x22, 0xfb, 0xfe, 0x07, 0x08, 0x17, 0x0d, 0xed,\n"," 0xe8, 0xf2, 0x0d, 0xdf, 0x14, 0xf5, 0xed, 0xe3, 0x00, 0x06, 0xfb, 0x15,\n"," 0x01, 0x03, 0xf9, 0xfe, 0x08, 0x14, 0x01, 0xf3, 0xe4, 0xfb, 0xfe, 0xde,\n"," 0x0f, 0xe8, 0xff, 0xf1, 0x03, 0xe5, 0x18, 0xff, 0xfd, 0x02, 0x10, 0xec,\n"," 0xfb, 0xf5, 0x12, 0x06, 0x0c, 0xde, 0x0f, 0x0e, 0x03, 0xf1, 0xf9, 0x02,\n"," 0xfa, 0x01, 0x07, 0xf3, 0x02, 0x0f, 0x03, 0x13, 0xf4, 0xee, 0x0a, 0x04,\n"," 0x0f, 0x1c, 0x1a, 0x03, 0x08, 0x06, 0xf6, 0x16, 0xff, 0xec, 0x14, 0xfe,\n"," 0x09, 0xf5, 0x06, 0x1d, 0xf3, 0xf0, 0x22, 0xf7, 0x28, 0xe3, 0x09, 0x28,\n"," 0xf2, 0x1a, 0x1c, 0x0e, 0x1a, 0xd5, 0xf6, 0xdd, 0x03, 0xce, 0xff, 0x03,\n"," 0xf5, 0xf2, 0x14, 0x02, 0x11, 0xd2, 0x08, 0xfa, 0xf2, 0xf7, 0xf6, 0xef,\n"," 0xf8, 0xea, 0xf3, 0xf7, 0xe7, 0x0e, 0x03, 0xf5, 0x07, 0x04, 0x21, 0xf5,\n"," 0xec, 0xf6, 0xf1, 0x0f, 0x09, 0x0a, 0x06, 0x03, 0x14, 0xee, 
0x03, 0x26,\n"," 0x01, 0x0a, 0x09, 0xf8, 0x0a, 0x17, 0xf6, 0x19, 0x1c, 0xfc, 0x0f, 0xf1,\n"," 0xf8, 0x06, 0xf7, 0xd9, 0x0b, 0x0e, 0x04, 0xda, 0x03, 0xe8, 0x15, 0x0a,\n"," 0x35, 0xfe, 0x03, 0xe5, 0x07, 0xfc, 0x11, 0xfa, 0xfc, 0xf4, 0xe9, 0x06,\n"," 0xfd, 0xe4, 0x15, 0x07, 0x10, 0xef, 0xf6, 0xfc, 0x13, 0x14, 0x08, 0x09,\n"," 0x12, 0xe6, 0xfb, 0xe1, 0x17, 0x04, 0xf8, 0xfc, 0xfc, 0xf1, 0xf3, 0xee,\n"," 0x27, 0x0d, 0xf7, 0xfd, 0x0a, 0xf7, 0x14, 0x00, 0x0d, 0xff, 0xf3, 0x0a,\n"," 0xf9, 0x01, 0x04, 0xfd, 0xf2, 0xf4, 0x13, 0x16, 0xfb, 0x09, 0xe4, 0xef,\n"," 0xf8, 0xf1, 0x10, 0xff, 0x14, 0xfa, 0xda, 0xf6, 0xff, 0xff, 0xfb, 0x10,\n"," 0x0b, 0x08, 0x0d, 0xf8, 0x04, 0x10, 0xf8, 0xf2, 0x10, 0x00, 0x16, 0x0b,\n"," 0x00, 0x00, 0x14, 0x0b, 0xee, 0xf7, 0x0e, 0x0b, 0xf8, 0xed, 0xf6, 0x0f,\n"," 0xff, 0xc1, 0xfc, 0x04, 0xf6, 0x0a, 0xfa, 0x01, 0xe3, 0xdc, 0x05, 0x07,\n"," 0x00, 0x27, 0x01, 0x06, 0xe1, 0xeb, 0x25, 0x05, 0xf1, 0x22, 0x17, 0x1a,\n"," 0x0a, 0xff, 0x15, 0x18, 0xf3, 0x0f, 0x01, 0x19, 0xfd, 0x0e, 0xec, 0x08,\n"," 0xfa, 0xfd, 0x0f, 0xeb, 0x09, 0x0e, 0xe2, 0x23, 0x07, 0xfa, 0xef, 0xfe,\n"," 0xe9, 0xfc, 0x27, 0x0d, 0x08, 0xf9, 0x0d, 0xf8, 0x1f, 0x15, 0x15, 0xd7,\n"," 0x1d, 0x1a, 0x0e, 0x12, 0x10, 0x23, 0x0d, 0xef, 0xf4, 0x04, 0xff, 0xec,\n"," 0x05, 0xfc, 0x05, 0x07, 0xf0, 0x0c, 0xfb, 0xf9, 0x07, 0xf4, 0x01, 0x0b,\n"," 0xf5, 0x02, 0x14, 0xfa, 0xe3, 0xee, 0xe5, 0x08, 0xea, 0x11, 0x08, 0x0f,\n"," 0xfc, 0xfc, 0xf4, 0xfb, 0xf6, 0x37, 0x0f, 0xea, 0xfe, 0xfe, 0xf6, 0xf5,\n"," 0x11, 0x27, 0xed, 0xe9, 0xfb, 0x09, 0xfb, 0x05, 0xeb, 0xf8, 0x00, 0xf0,\n"," 0xf1, 0x0c, 0x2b, 0x07, 0xe3, 0x0d, 0x27, 0xdc, 0x06, 0x22, 0xf3, 0x02,\n"," 0xf9, 0x0a, 0x07, 0x24, 0xfe, 0x0a, 0x17, 0x1a, 0x07, 0xf7, 0xee, 0xf3,\n"," 0x14, 0x0c, 0x04, 0x08, 0xf2, 0xec, 0xf7, 0x1d, 0xf1, 0xef, 0xf8, 0xef,\n"," 0x19, 0xe8, 0x1d, 0x1a, 0xe1, 0xd8, 0x0c, 0xee, 0xe7, 0x17, 0x16, 0xe4,\n"," 0xf4, 0xe8, 0x26, 0x08, 0x05, 0x24, 0x06, 0x0b, 0xf7, 0xe8, 0x27, 0x17,\n"," 0xe5, 0xe7, 0xeb, 0xe8, 0x0d, 0xe2, 0xf7, 0x11, 0xfd, 0xdb, 0xf9, 0x17,\n"," 0xfc, 0x15, 0x0f, 0x17, 0xe6, 0xeb, 0xf4, 0xf9, 0x03, 0x19, 0xe0, 0x1e,\n"," 0x09, 0xed, 0xfe, 0xf7, 0x2a, 0x26, 0x12, 0x1a, 0xed, 0xe9, 0x0b, 0xf5,\n"," 0x15, 0x20, 0x1c, 0x07, 0x07, 0xf7, 0x0a, 0x0d, 0x0f, 0x1e, 0x1a, 0xe6,\n"," 0x0f, 0x24, 0x03, 0x1b, 0x20, 0xfc, 0x13, 0x04, 0x0c, 0x03, 0xfe, 0xea,\n"," 0x00, 0x07, 0xec, 0x0f, 0xde, 0x16, 0x19, 0x07, 0xe7, 0xe5, 0x15, 0xfd,\n"," 0xd4, 0x1a, 0xfb, 0x01, 0x07, 0xdb, 0x04, 0xfe, 0xda, 0x20, 0xf9, 0x0f,\n"," 0xce, 0xf6, 0x19, 0x14, 0xe6, 0x2f, 0xed, 0x0b, 0x02, 0xfb, 0xd8, 0xf8,\n"," 0xec, 0x1f, 0x03, 0xfe, 0x14, 0x1e, 0xfd, 0x00, 0xff, 0x13, 0xf4, 0xfb,\n"," 0x01, 0x08, 0xd7, 0x03, 0x03, 0xe0, 0x03, 0xef, 0xfe, 0x0a, 0xe3, 0x05,\n"," 0x03, 0x0b, 0x1e, 0xf0, 0xf1, 0x16, 0x18, 0x01, 0xfb, 0xe5, 0xf5, 0xdc,\n"," 0x03, 0xed, 0x02, 0xff, 0x0b, 0x1a, 0xf7, 0x24, 0xf9, 0xda, 0x1a, 0xe7,\n"," 0x05, 0x1d, 0xf8, 0xf1, 0xf6, 0xf2, 0xd6, 0xf0, 0xfb, 0x16, 0xf1, 0x10,\n"," 0x17, 0xf5, 0x08, 0x09, 0xf7, 0xfa, 0xed, 0x02, 0x09, 0xfc, 0xf1, 0xf2,\n"," 0xfd, 0xea, 0xfc, 0x01, 0x07, 0x06, 0x09, 0x06, 0x08, 0xfb, 0xea, 0x0c,\n"," 0x03, 0x1e, 0x0b, 0x2b, 0xe3, 0xf1, 0x0b, 0xe4, 0x1b, 0x27, 0xea, 0x1c,\n"," 0x0b, 0xfb, 0x01, 0x04, 0x1c, 0x26, 0xf2, 0xf2, 0xf6, 0xf2, 0xfb, 0xfb,\n"," 0x05, 0x2c, 0xef, 0xe9, 0xfb, 0x05, 0x10, 0x0b, 0x08, 0x05, 0x1c, 0xf1,\n"," 0xd2, 0x07, 0x0b, 0xe0, 0xf9, 0x03, 0xe7, 0xf3, 0xfa, 0x12, 0xee, 0xf3,\n"," 0xe0, 0xf8, 0x0e, 0xf0, 0xf1, 0x30, 0x17, 0x01, 0x00, 0xe0, 0x1a, 0xfe,\n"," 0xde, 0x2c, 0x03, 0x05, 0x00, 0xe5, 0xf7, 0x02, 0xfb, 0x34, 0xdd, 
0x08,\n"," 0x09, 0x06, 0x1f, 0x0a, 0x00, 0x14, 0xec, 0xdd, 0xf7, 0xf0, 0xdb, 0xe9,\n"," 0xf8, 0x14, 0xff, 0xee, 0xf5, 0xf9, 0x12, 0x01, 0x0c, 0xf7, 0xfd, 0x23,\n"," 0xff, 0x0d, 0x19, 0x12, 0xfa, 0xf6, 0xf9, 0xfe, 0xe6, 0x00, 0x21, 0x0b,\n"," 0xf8, 0xfd, 0x15, 0xfb, 0xee, 0xf2, 0xfe, 0x0a, 0x12, 0x1d, 0x09, 0xee,\n"," 0xf4, 0xc4, 0xff, 0xe7, 0xfd, 0x2a, 0x22, 0x00, 0xe9, 0xff, 0xea, 0xf1,\n"," 0xfb, 0x15, 0xe0, 0x19, 0xde, 0xe6, 0xf1, 0x00, 0xee, 0xfd, 0xf5, 0x0a,\n"," 0x00, 0xfd, 0x0a, 0x0d, 0xf4, 0xf9, 0xf2, 0xe6, 0x02, 0x15, 0x1c, 0x00,\n"," 0xee, 0xfb, 0xfe, 0xed, 0xf0, 0x3e, 0xff, 0x2f, 0xf6, 0xf7, 0xf7, 0xda,\n"," 0x11, 0x22, 0x15, 0x26, 0xfc, 0xfe, 0xfb, 0xfc, 0xf6, 0x2f, 0x02, 0x14,\n"," 0x18, 0xe9, 0x14, 0x19, 0x14, 0x22, 0x02, 0xfd, 0xff, 0x1a, 0x13, 0xf9,\n"," 0xfd, 0x08, 0x06, 0xeb, 0xeb, 0x1e, 0xf0, 0xf6, 0xf4, 0x01, 0xf9, 0x0f,\n"," 0xe5, 0x03, 0xf4, 0xea, 0x02, 0xe0, 0x04, 0x09, 0xe2, 0x2d, 0xf7, 0x16,\n"," 0x04, 0xde, 0xd8, 0xf2, 0xe2, 0x46, 0xe3, 0x08, 0xe8, 0x0d, 0xf6, 0xfc,\n"," 0xfb, 0x2b, 0xf6, 0x0d, 0xe4, 0x01, 0xfa, 0x03, 0xeb, 0x28, 0x03, 0x24,\n"," 0x1d, 0xf3, 0xff, 0xe9, 0xe7, 0x19, 0x1a, 0xe3, 0x04, 0xf7, 0xed, 0xfd,\n"," 0x02, 0x04, 0x14, 0x09, 0x09, 0x1c, 0x0b, 0x08, 0x09, 0xe8, 0x0b, 0xef,\n"," 0x04, 0x02, 0xfe, 0x19, 0xfc, 0xf4, 0x08, 0xf8, 0xef, 0xd4, 0x04, 0x13,\n"," 0xf6, 0x1c, 0x16, 0x0b, 0xe1, 0xc3, 0xe0, 0xc7, 0x0f, 0x40, 0x12, 0xff,\n"," 0xdf, 0x02, 0xf5, 0xf2, 0xfd, 0x0a, 0xfa, 0x12, 0xef, 0xe6, 0xfb, 0x0c,\n"," 0xfa, 0x0d, 0xfa, 0x18, 0xed, 0xfe, 0x21, 0xf9, 0xed, 0xf3, 0x00, 0x1f,\n"," 0xfc, 0x08, 0x1d, 0x20, 0xdd, 0x14, 0xf8, 0x0e, 0x15, 0x40, 0xeb, 0x30,\n"," 0xdb, 0x09, 0xfc, 0xf1, 0xee, 0x1d, 0x0d, 0x3a, 0x02, 0x0c, 0x0d, 0xf3,\n"," 0x2b, 0x2c, 0x0e, 0x0a, 0x04, 0xf6, 0xfe, 0xe6, 0x17, 0x21, 0xee, 0x0a,\n"," 0x11, 0x05, 0xf4, 0x19, 0x05, 0x2b, 0xe7, 0xfa, 0xfa, 0x25, 0x08, 0xd8,\n"," 0xdd, 0xf6, 0xf6, 0x22, 0xf0, 0xfa, 0x06, 0xdf, 0xe5, 0xe1, 0x09, 0xf2,\n"," 0xfc, 0x2d, 0x07, 0xfa, 0xf2, 0xe8, 0xf7, 0xee, 0xf7, 0x46, 0x03, 0xfb,\n"," 0xe9, 0xf7, 0x07, 0x01, 0x1b, 0x23, 0xf3, 0x09, 0xff, 0x07, 0xfa, 0xeb,\n"," 0xfb, 0x38, 0x05, 0xf1, 0xed, 0xf9, 0x13, 0xfd, 0xf9, 0x16, 0x04, 0x12,\n"," 0x00, 0x06, 0xf1, 0xf2, 0x0c, 0xfe, 0xf4, 0xd7, 0x08, 0x15, 0xe2, 0x11,\n"," 0x14, 0x0c, 0x02, 0xeb, 0x06, 0x21, 0x00, 0x0c, 0x14, 0x0a, 0x24, 0xfe,\n"," 0xda, 0xdb, 0x0f, 0x0a, 0xf5, 0x3a, 0x11, 0xe3, 0xed, 0xcc, 0xfb, 0xbb,\n"," 0x12, 0x27, 0x0a, 0x02, 0xe8, 0x00, 0xfe, 0xf2, 0xfe, 0x1c, 0x05, 0xfb,\n"," 0xf9, 0x0c, 0xf8, 0x1c, 0xe9, 0xfa, 0xe5, 0x10, 0xdc, 0xea, 0xdb, 0xfd,\n"," 0xe4, 0x0a, 0xe9, 0xf5, 0xe9, 0x01, 0x2a, 0x19, 0xf9, 0x10, 0xfc, 0xff,\n"," 0x06, 0x27, 0x0a, 0x4c, 0xe9, 0x03, 0xf4, 0x10, 0x25, 0x48, 0xef, 0x3f,\n"," 0xfe, 0x00, 0xf9, 0x0a, 0x21, 0x2d, 0x08, 0x18, 0x0a, 0xed, 0x06, 0xe4,\n"," 0x2d, 0x13, 0x09, 0x0c, 0x0c, 0x0f, 0x11, 0x06, 0x18, 0x18, 0xf0, 0xff,\n"," 0xf2, 0x1e, 0xf8, 0x13, 0xe6, 0xf3, 0xea, 0x1e, 0xf5, 0x18, 0xfb, 0x1c,\n"," 0xe2, 0xdb, 0x13, 0xf8, 0x03, 0x35, 0xfc, 0xf8, 0xed, 0xf1, 0x05, 0xf6,\n"," 0x0b, 0x3c, 0xfe, 0x06, 0xe1, 0x0f, 0x03, 0x07, 0x11, 0x29, 0x16, 0x0e,\n"," 0xec, 0x01, 0xf3, 0xf3, 0x11, 0x29, 0x07, 0x04, 0x15, 0x11, 0x10, 0xf0,\n"," 0x04, 0x11, 0xf2, 0x22, 0x08, 0x0b, 0xff, 0xe8, 0x08, 0xf5, 0x00, 0xe1,\n"," 0x01, 0x09, 0x04, 0xfd, 0x03, 0xea, 0x06, 0xf6, 0x01, 0x08, 0xed, 0x0d,\n"," 0xfe, 0x0f, 0x07, 0x00, 0xe3, 0xd8, 0x02, 0x1e, 0xf3, 0x3d, 0x35, 0x0f,\n"," 0xcb, 0xe2, 0x13, 0xd6, 0x0c, 0x4e, 0x16, 0xe3, 0xe0, 0xf2, 0xf4, 0xf4,\n"," 0xf5, 0x28, 0xf8, 0xf8, 0xe8, 0x05, 0xe8, 0x12, 0xf9, 0x04, 0xee, 0x0e,\n"," 
[Extraction-damaged span: a long run of hex byte literals ("0xdf, 0xee, 0xed, 0x0d, ...") interleaved with JSON string-escape residue (`\n","`), apparently an embedded binary data array from a later patch in this series. No readable diff or prose content survives in this span.]
0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0xc9, 0x01, 0x00, 0x00, 0x59, 0xfe, 0xff, 0xff,\n"," 0x8f, 0xfe, 0xff, 0xff, 0x50, 0x01, 0x00, 0x00, 0x60, 0xfb, 0xff, 0xff,\n"," 0x0f, 0x00, 0x00, 0x00, 0x54, 0x4f, 0x43, 0x4f, 0x20, 0x43, 0x6f, 0x6e,\n"," 0x76, 0x65, 0x72, 0x74, 0x65, 0x64, 0x2e, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x58, 0xfa, 0xff, 0xff, 0xbc, 0x01, 0x00, 0x00,\n"," 0xb0, 0x01, 0x00, 0x00, 0xa4, 0x01, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x78, 0x01, 0x00, 0x00, 0x18, 0x01, 0x00, 0x00,\n"," 0xb4, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0xaa, 0xfe, 0xff, 0xff, 0x05, 0x00, 0x00, 0x00,\n"," 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0b, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n"," 0xce, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x03, 0x00, 0x00, 0x00,\n"," 0x1c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x1a, 0xff, 0xff, 0xff, 0x00, 0x00, 0x80, 0x3f, 0x01, 0x00, 0x00, 0x00,\n"," 0x09, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00,\n"," 0x07, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08,\n"," 0x01, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x28, 0xfc, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x07, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x16, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x07, 0x00, 0x10, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0x38, 0x00, 0x00, 0x00,\n"," 0x2c, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x14, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00,\n"," 0x1a, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x07, 0x00, 0x14, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x02, 0x00, 0x00, 0x00,\n"," 0x38, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff,\n"," 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x03, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00,\n"," 0x10, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x0a, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x0a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0xac, 0x04, 0x00, 0x00, 0x44, 0x04, 0x00, 0x00, 0xc4, 0x03, 0x00, 0x00,\n"," 0x4c, 0x03, 0x00, 0x00, 0xd0, 0x02, 0x00, 0x00, 0x90, 0x02, 0x00, 0x00,\n"," 0x20, 0x02, 0x00, 0x00, 0xb4, 0x01, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00,\n"," 0x6c, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 
0x00,\n"," 0xd4, 0xff, 0xff, 0xff, 0x1c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x0e, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,\n"," 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00,\n"," 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f, 0x31, 0x00, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa8, 0x07, 0x00, 0x00,\n"," 0xf2, 0xfb, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x4c, 0x00, 0x00, 0x00,\n"," 0x07, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xe4, 0xfb, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80, 0x3b,\n"," 0x13, 0x00, 0x00, 0x00, 0x6c, 0x61, 0x62, 0x65, 0x6c, 0x73, 0x5f, 0x73,\n"," 0x6f, 0x66, 0x74, 0x6d, 0x61, 0x78, 0x5f, 0x69, 0x6e, 0x74, 0x38, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x0e, 0x00, 0x1a, 0x00, 0x08, 0x00, 0x07, 0x00, 0x0c, 0x00,\n"," 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,\n"," 0xb4, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00,\n"," 0x18, 0x00, 0x00, 0x00, 0x00, 0x00, 0x12, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0c, 0x00,\n"," 0x12, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x03, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0xd6, 0x72, 0xec, 0x39, 0x57, 0x66, 0x72, 0x3a,\n"," 0x1e, 0xe6, 0x14, 0x3a, 0x27, 0x15, 0x3a, 0x39, 0x33, 0xb7, 0x25, 0x3a,\n"," 0xf6, 0x03, 0x80, 0x3a, 0xd2, 0x73, 0x28, 0x39, 0x79, 0xbb, 0x5c, 0x3a,\n"," 0x12, 0x00, 0x00, 0x00, 0x66, 0x69, 0x72, 0x73, 0x74, 0x5f, 0x77, 0x65,\n"," 0x69, 0x67, 0x68, 0x74, 0x73, 0x2f, 0x72, 0x65, 0x61, 0x64, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x32, 0xfd, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x09, 0x54, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x24, 0xfd, 0xff, 0xff,\n"," 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x4a, 0xb2, 0xf3, 0x39, 0x1f, 0x00, 0x00, 0x00, 0x66, 0x69, 0x6e, 0x61,\n"," 0x6c, 0x5f, 0x66, 0x63, 0x5f, 0x77, 0x65, 0x69, 0x67, 0x68, 0x74, 0x73,\n"," 0x2f, 0x72, 0x65, 0x61, 0x64, 0x2f, 0x74, 0x72, 0x61, 0x6e, 0x73, 0x70,\n"," 0x6f, 0x73, 0x65, 0x00, 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xa0, 0x0f, 0x00, 0x00, 0x9a, 0xfd, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x58, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x6c, 0xfe, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 
0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xbb, 0xb0, 0xba, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0xd8, 0x1c, 0x35, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x3b, 0xcf, 0x3e, 0xc1, 0x05, 0x00, 0x00, 0x00, 0x61, 0x64, 0x64, 0x5f,\n"," 0x31, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x06, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,\n"," 0x2c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0x04, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x0f, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68, 0x61, 0x70, 0x65, 0x5f,\n"," 0x32, 0x2f, 0x73, 0x68, 0x61, 0x70, 0x65, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x42, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x5c, 0x00, 0x00, 0x00, 0x05, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x14, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,\n"," 0x61, 0x70, 0x65, 0x5f, 0x32, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x31, 0x00, 0x00, 0x00, 0x28, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0xba, 0xfe, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09,\n"," 0x60, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x44, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x8c, 0xff, 0xff, 0xff, 0x30, 0x00, 0x00, 0x00,\n"," 0x24, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,\n"," 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x50, 0x50, 0xd0, 0x3d,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcf, 0x41, 0x01, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x52, 0x65, 0x73, 0x68,\n"," 0x61, 0x70, 0x65, 0x5f, 0x31, 0x5f, 0x69, 0x6e, 0x74, 0x38, 0x00, 0x00,\n"," 0x02, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0xa8, 0x07, 0x00, 0x00,\n"," 0x2e, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x09, 0x60, 0x00, 0x00, 0x00,\n"," 0x09, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00,\n"," 0x0c, 0x00, 0x14, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x10, 0x00,\n"," 0x0c, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00,\n"," 0x14, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x80, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x00, 0x00, 0x00,\n"," 0xbd, 0xad, 0x93, 0x3d, 0x01, 0x00, 0x00, 0x00, 0x0f, 0x1a, 0x93, 0x41,\n"," 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x52, 0x65, 0x6c, 0x75, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x14, 0x00, 0x00, 0x00,\n"," 0x08, 0x00, 0x00, 0x00, 0xaa, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x02,\n"," 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x9c, 0xff, 0xff, 0xff, 0x18, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0xc4, 0x94, 
0x0c, 0x38, 0x0b, 0x00, 0x00, 0x00, 0x4d, 0x61, 0x74, 0x4d,\n"," 0x75, 0x6c, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00, 0x01, 0x00, 0x00, 0x00,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x18, 0x00, 0x08, 0x00,\n"," 0x07, 0x00, 0x0c, 0x00, 0x10, 0x00, 0x14, 0x00, 0x0e, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x02, 0xa4, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00,\n"," 0x8c, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x0c, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x08, 0x00, 0x0c, 0x00, 0x00, 0x00,\n"," 0x50, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00,\n"," 0x7c, 0x67, 0x40, 0x38, 0x32, 0x3f, 0xc5, 0x38, 0x5e, 0x53, 0x72, 0x38,\n"," 0x90, 0x6b, 0x97, 0x37, 0xd6, 0xd8, 0x86, 0x38, 0xc2, 0x56, 0xd0, 0x38,\n"," 0xf3, 0x12, 0x89, 0x37, 0x92, 0x9d, 0xb3, 0x38, 0x0b, 0x00, 0x00, 0x00,\n"," 0x43, 0x6f, 0x6e, 0x76, 0x32, 0x44, 0x5f, 0x62, 0x69, 0x61, 0x73, 0x00,\n"," 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x70, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00,\n"," 0x28, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00,\n"," 0xca, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x06, 0x02, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x06, 0x00, 0x08, 0x00, 0x07, 0x00, 0x06, 0x00, 0x00, 0x00,\n"," 0x00, 0x00, 0x00, 0x72, 0xe6, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x19,\n"," 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00, 0x06, 0x00, 0x05, 0x00,\n"," 0x06, 0x00, 0x00, 0x00, 0x00, 0x16, 0x0a, 0x00, 0x0e, 0x00, 0x07, 0x00,\n"," 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x09,\n"," 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0a, 0x00, 0x0c, 0x00, 0x07, 0x00,\n"," 0x00, 0x00, 0x08, 0x00, 0x0a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x04,\n"," 0x03, 0x00, 0x00, 0x00\n","};\n","unsigned int g_model_len = 18952;\n"],"name":"stdout"}]}]} \ No newline at end of file +{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"name":"train_micro_speech_model.ipynb","provenance":[{"file_id":"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/train_micro_speech_model.ipynb","timestamp":1587690382292}],"collapsed_sections":[],"toc_visible":true},"kernelspec":{"name":"python3","display_name":"Python 3"},"accelerator":"GPU"},"cells":[{"cell_type":"markdown","metadata":{"id":"pO4-CY_TCZZS","colab_type":"text"},"source":["# Train a Simple Audio Recognition Model"]},{"cell_type":"markdown","metadata":{"id":"BaFfr7DHRmGF","colab_type":"text"},"source":["This notebook demonstrates how to train a 20 kB [Simple Audio Recognition](https://www.tensorflow.org/tutorials/sequences/audio_recognition) model to recognize keywords in speech.\n","\n","The model created in this notebook is used in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) example for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview).\n","\n","\n"," \n"," \n","
\n"," Run in Google Colab\n"," \n"," View source on GitHub\n","
\n"]},{"cell_type":"markdown","metadata":{"id":"XaVtYN4nlCft","colab_type":"text"},"source":["**Training is much faster using GPU acceleration.** Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and set **Hardware accelerator: GPU**. Training 15,000 iterations will take 1.5 - 2 hours on a GPU runtime.\n","\n","## Configure Defaults\n","\n","**MODIFY** the following constants for your specific use case."]},{"cell_type":"code","metadata":{"id":"ludfxbNIaegy","colab_type":"code","colab":{}},"source":["# A comma-delimited list of the words you want to train for.\n","# The options are: yes,no,up,down,left,right,on,off,stop,go\n","# All the other words will be used to train an \"unknown\" label and silent\n","# audio data with no spoken words will be used to train a \"silence\" label.\n","WANTED_WORDS = \"yes,no\"\n","\n","# The number of steps and learning rates can be specified as comma-separated\n","# lists to define the rate at each stage. For example,\n","# TRAINING_STEPS=12000,3000 and LEARNING_RATE=0.001,0.0001\n","# will run 12,000 training loops in total, with a rate of 0.001 for the first\n","# 8,000, and 0.0001 for the final 3,000.\n","TRAINING_STEPS = \"12000,3000\"\n","LEARNING_RATE = \"0.001,0.0001\"\n","\n","# Calculate the total number of steps, which is used to identify the checkpoint\n","# file name.\n","TOTAL_STEPS = str(sum(map(lambda string: int(string), TRAINING_STEPS.split(\",\"))))\n","\n","# Print the configuration to confirm it\n","print(\"Training these words: %s\" % WANTED_WORDS)\n","print(\"Training steps in each stage: %s\" % TRAINING_STEPS)\n","print(\"Learning rate in each stage: %s\" % LEARNING_RATE)\n","print(\"Total number of training steps: %s\" % TOTAL_STEPS)"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"gCgeOpvY9pAi","colab_type":"text"},"source":["**DO NOT MODIFY** the following constants as they include filepaths used in this notebook and data that is shared during training and inference."]},{"cell_type":"code","metadata":{"id":"Nd1iM1o2ymvA","colab_type":"code","colab":{}},"source":["# Calculate the percentage of 'silence' and 'unknown' training samples required\n","# to ensure that we have equal number of samples for each label.\n","number_of_labels = WANTED_WORDS.count(',') + 1\n","number_of_total_labels = number_of_labels + 2 # for 'silence' and 'unknown' label\n","equal_percentage_of_training_samples = int(100.0/(number_of_total_labels))\n","SILENT_PERCENTAGE = equal_percentage_of_training_samples\n","UNKNOWN_PERCENTAGE = equal_percentage_of_training_samples\n","\n","# Constants which are shared during training and inference\n","PREPROCESS = 'micro'\n","WINDOW_STRIDE = 20\n","MODEL_ARCHITECTURE = 'tiny_conv' # Other options include: single_fc, conv,\n"," # low_latency_conv, low_latency_svdf, tiny_embedding_conv\n","\n","# Constants used during training only\n","VERBOSITY = 'WARN'\n","EVAL_STEP_INTERVAL = '1000'\n","SAVE_STEP_INTERVAL = '1000'\n","\n","# Constants for training directories and filepaths\n","DATASET_DIR = 'dataset/'\n","LOGS_DIR = 'logs/'\n","TRAIN_DIR = 'train/' # for training checkpoints and other files.\n","\n","# Constants for inference directories and filepaths\n","import os\n","MODELS_DIR = 'models'\n","if not os.path.exists(MODELS_DIR):\n"," os.mkdir(MODELS_DIR)\n","MODEL_TF = os.path.join(MODELS_DIR, 'model.pb')\n","MODEL_TFLITE = os.path.join(MODELS_DIR, 'model.tflite')\n","FLOAT_MODEL_TFLITE = os.path.join(MODELS_DIR, 
'float_model.tflite')\n","MODEL_TFLITE_MICRO = os.path.join(MODELS_DIR, 'model.cc')\n","SAVED_MODEL = os.path.join(MODELS_DIR, 'saved_model')\n","\n","QUANT_INPUT_MIN = 0.0\n","QUANT_INPUT_MAX = 26.0\n","QUANT_INPUT_RANGE = QUANT_INPUT_MAX - QUANT_INPUT_MIN"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"6rLYpvtg9P4o","colab_type":"text"},"source":["## Setup Environment\n","\n","Install Dependencies"]},{"cell_type":"code","metadata":{"id":"ed_XpUrU5DvY","colab_type":"code","colab":{}},"source":["%tensorflow_version 1.x\n","import tensorflow as tf"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"T9Ty5mR58E4i","colab_type":"text"},"source":["**DELETE** any old data from previous runs\n"]},{"cell_type":"code","metadata":{"id":"APGx0fEh7hFF","colab_type":"code","colab":{}},"source":["!rm -rf {DATASET_DIR} {LOGS_DIR} {TRAIN_DIR} {MODELS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"GfEUlfFBizio","colab_type":"text"},"source":["Clone the TensorFlow Github Repository, which contains the relevant code required to run this tutorial."]},{"cell_type":"code","metadata":{"id":"yZArmzT85SLq","colab_type":"code","colab":{}},"source":["!git clone -q --depth 1 https://github.com/tensorflow/tensorflow"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"nS9swHLSi7Bi","colab_type":"text"},"source":["Load TensorBoard to visualize the accuracy and loss as training proceeds.\n"]},{"cell_type":"code","metadata":{"id":"q4qF1VxP3UE4","colab_type":"code","colab":{}},"source":["%load_ext tensorboard\n","%tensorboard --logdir {LOGS_DIR}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"x1J96Ron-O4R","colab_type":"text"},"source":["## Training\n","\n","The following script downloads the dataset and begin training."]},{"cell_type":"code","metadata":{"id":"VJsEZx6lynbY","colab_type":"code","colab":{}},"source":["!python tensorflow/tensorflow/examples/speech_commands/train.py \\\n","--data_dir={DATASET_DIR} \\\n","--wanted_words={WANTED_WORDS} \\\n","--silence_percentage={SILENT_PERCENTAGE} \\\n","--unknown_percentage={UNKNOWN_PERCENTAGE} \\\n","--preprocess={PREPROCESS} \\\n","--window_stride={WINDOW_STRIDE} \\\n","--model_architecture={MODEL_ARCHITECTURE} \\\n","--how_many_training_steps={TRAINING_STEPS} \\\n","--learning_rate={LEARNING_RATE} \\\n","--train_dir={TRAIN_DIR} \\\n","--summaries_dir={LOGS_DIR} \\\n","--verbosity={VERBOSITY} \\\n","--eval_step_interval={EVAL_STEP_INTERVAL} \\\n","--save_step_interval={SAVE_STEP_INTERVAL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"UczQKtqLi7OJ","colab_type":"text"},"source":["# Skipping the training\n","\n","If you don't want to spend an hour or two training the model from scratch, you can download pretrained checkpoints by uncommenting the lines below (removing the '#'s at the start of each line) and running them."]},{"cell_type":"code","metadata":{"id":"RZw3VNlnla-J","colab_type":"code","colab":{}},"source":["#!curl -O \"https://storage.googleapis.com/download.tensorflow.org/models/tflite/speech_micro_train_2020_05_10.tgz\"\n","#!tar xzf speech_micro_train_2020_05_10.tgz"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"XQUJLrdS-ftl","colab_type":"text"},"source":["## Generate a TensorFlow Model for Inference\n","\n","Combine relevant training results (graph, weights, etc) into a single file for inference. 
This process is known as freezing a model and the resulting model is known as a frozen model/graph, as it cannot be further re-trained after this process."]},{"cell_type":"code","metadata":{"id":"xyc3_eLh9sAg","colab_type":"code","colab":{}},"source":["!rm -rf {SAVED_MODEL}\n","!python tensorflow/tensorflow/examples/speech_commands/freeze.py \\\n","--wanted_words=$WANTED_WORDS \\\n","--window_stride_ms=$WINDOW_STRIDE \\\n","--preprocess=$PREPROCESS \\\n","--model_architecture=$MODEL_ARCHITECTURE \\\n","--start_checkpoint=$TRAIN_DIR$MODEL_ARCHITECTURE'.ckpt-'{TOTAL_STEPS} \\\n","--save_format=saved_model \\\n","--output_file={SAVED_MODEL}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"_DBGDxVI-nKG","colab_type":"text"},"source":["## Generate a TensorFlow Lite Model\n","\n","Convert the frozen graph into a TensorFlow Lite model, which is fully quantized for use with embedded devices.\n","\n","The following cell will also print the model size, which will be under 20 kilobytes."]},{"cell_type":"code","metadata":{"id":"RIitkqvGWmre","colab_type":"code","colab":{}},"source":["import sys\n","# We add this path so we can import the speech processing modules.\n","sys.path.append(\"/content/tensorflow/tensorflow/examples/speech_commands/\")\n","import input_data\n","import models\n","import numpy as np"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"kzqECqMxgBh4","colab_type":"code","colab":{}},"source":["SAMPLE_RATE = 16000\n","CLIP_DURATION_MS = 1000\n","WINDOW_SIZE_MS = 30.0\n","FEATURE_BIN_COUNT = 40\n","BACKGROUND_FREQUENCY = 0.8\n","BACKGROUND_VOLUME_RANGE = 0.1\n","TIME_SHIFT_MS = 100.0\n","\n","DATA_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/speech_commands_v0.02.tar.gz'\n","VALIDATION_PERCENTAGE = 10\n","TESTING_PERCENTAGE = 10"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"rNQdAplJV1fz","colab_type":"code","colab":{}},"source":["model_settings = models.prepare_model_settings(\n"," len(input_data.prepare_words_list(WANTED_WORDS.split(','))),\n"," SAMPLE_RATE, CLIP_DURATION_MS, WINDOW_SIZE_MS,\n"," WINDOW_STRIDE, FEATURE_BIN_COUNT, PREPROCESS)\n","audio_processor = input_data.AudioProcessor(\n"," DATA_URL, DATASET_DIR,\n"," SILENT_PERCENTAGE, UNKNOWN_PERCENTAGE,\n"," WANTED_WORDS.split(','), VALIDATION_PERCENTAGE,\n"," TESTING_PERCENTAGE, model_settings, LOGS_DIR)"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"lBj_AyCh1cC0","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," float_converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," float_tflite_model = float_converter.convert()\n"," float_tflite_model_size = open(FLOAT_MODEL_TFLITE, \"wb\").write(float_tflite_model)\n"," print(\"Float model is %d bytes\" % float_tflite_model_size)\n","\n"," converter = tf.lite.TFLiteConverter.from_saved_model(SAVED_MODEL)\n"," converter.optimizations = [tf.lite.Optimize.DEFAULT]\n"," converter.inference_input_type = tf.lite.constants.INT8\n"," converter.inference_output_type = tf.lite.constants.INT8\n"," def representative_dataset_gen():\n"," for i in range(100):\n"," data, _ = audio_processor.get_data(1, i*1, model_settings,\n"," BACKGROUND_FREQUENCY, \n"," BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS,\n"," 'testing',\n"," sess)\n"," flattened_data = np.array(data.flatten(), dtype=np.float32).reshape(1, 1960)\n"," yield [flattened_data]\n"," converter.representative_dataset = representative_dataset_gen\n"," tflite_model = 
converter.convert()\n"," tflite_model_size = open(MODEL_TFLITE, \"wb\").write(tflite_model)\n"," print(\"Quantized model is %d bytes\" % tflite_model_size)\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"EeLiDZTbLkzv","colab_type":"text"},"source":["# Testing the TensorFlow Lite model's accuracy\n","\n","Verify that the model we've exported is still accurate, using the TF Lite Python API and our test set."]},{"cell_type":"code","metadata":{"id":"wQsEteKRLryJ","colab_type":"code","colab":{}},"source":["with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","float_interpreter = tf.lite.Interpreter(FLOAT_MODEL_TFLITE)\n","float_interpreter.allocate_tensors()\n","\n","float_input_index = float_interpreter.get_input_details()[0][\"index\"]\n","\n","float_output_index = float_interpreter.get_output_details()[0][\"index\"]\n","float_model_output = float_interpreter.tensor(float_output_index)\n","\n","float_correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," flattened_input = np.array(current_input.flatten(), dtype=np.float32).reshape(1, 1960)\n"," float_interpreter.set_tensor(float_input_index, flattened_input)\n"," float_interpreter.invoke()\n"," top_prediction = float_model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," float_correct_predictions += 1\n","\n","print('Float accuracy is %f%% (N=%d)' % ((float_correct_predictions * 100) / len(test_data), len(test_data)))\n","\n","interpreter = tf.lite.Interpreter(MODEL_TFLITE)\n","interpreter.allocate_tensors()\n","\n","input_index = interpreter.get_input_details()[0][\"index\"]\n","\n","output_index = interpreter.get_output_details()[0][\"index\"]\n","model_output = interpreter.tensor(output_index)\n","\n","with tf.Session() as sess:\n"," test_data, test_labels = audio_processor.get_data(\n"," -1, 0, model_settings, BACKGROUND_FREQUENCY, BACKGROUND_VOLUME_RANGE,\n"," TIME_SHIFT_MS, 'testing', sess)\n","\n","correct_predictions = 0\n","for i in range(len(test_data)):\n"," current_input = test_data[i]\n"," current_label = test_labels[i]\n"," quantized_input = np.zeros((1960), np.int8)\n"," for index, input_value in enumerate(current_input.flatten()):\n"," # These scaling values are derived from those used in input_data.py in the\n"," # training pipeline.\n"," value = ((input_value - QUANT_INPUT_MIN) * 256) / QUANT_INPUT_RANGE\n"," value -= 128\n"," if value < -128:\n"," value = -128\n"," if value > 127:\n"," value = 127\n"," quantized_input[index] = value\n"," flattened_input = np.array(quantized_input.flatten(), dtype=np.int8).reshape(1, 1960)\n"," interpreter.set_tensor(input_index, flattened_input)\n"," interpreter.invoke()\n"," top_prediction = model_output()[0].argmax()\n"," if top_prediction == current_label:\n"," correct_predictions += 1\n","\n","print('Quantized accuracy is %f%% (N=%d)' % ((correct_predictions * 100) / len(test_data), len(test_data)))\n"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"dt6Zqbxu-wIi","colab_type":"text"},"source":["## Generate a TensorFlow Lite for MicroControllers Model\n","Convert the TensorFlow Lite model into a C source file that can be loaded by TensorFlow Lite for Microcontrollers."]},{"cell_type":"code","metadata":{"id":"XohZOTjR8ZyE","colab_type":"code","colab":{}},"source":["# Install xxd if it 
is not available\n","!apt-get update && apt-get -qq install xxd\n","# Convert to a C source file\n","!xxd -i {MODEL_TFLITE} > {MODEL_TFLITE_MICRO}\n","# Update variable names\n","REPLACE_TEXT = MODEL_TFLITE.replace('/', '_').replace('.', '_')\n","!sed -i 's/'{REPLACE_TEXT}'/g_model/g' {MODEL_TFLITE_MICRO}"],"execution_count":0,"outputs":[]},{"cell_type":"markdown","metadata":{"id":"2pQnN0i_-0L2","colab_type":"text"},"source":["## Deploy to a Microcontroller\n","\n","Follow the instructions in the [micro_speech](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/micro_speech) README.md for [TensorFlow Lite for MicroControllers](https://www.tensorflow.org/lite/microcontrollers/overview) to deploy this model on a specific microcontroller.\n","\n","**Reference Model:** If you have not modified this notebook, you can follow the instructions as is, to deploy the model. Refer to the [`micro_speech/train/models`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/train/models) directory to access the models generated in this notebook. \n","\n","**New Model:** If you have generated a new model to identify different words: (i) Update `kCategoryCount` and `kCategoryLabels` in [`micro_speech/micro_features/micro_model_settings.h`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/micro_model_settings.h) and (ii) Update the values assigned to the variables defined in [`micro_speech/micro_features/model.cc`](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/micro/examples/micro_speech/micro_features/model.cc) with values displayed after running the following cell."]},{"cell_type":"code","metadata":{"id":"eoYyh0VU8pca","colab_type":"code","colab":{}},"source":["# Print the C source file\n","!cat {MODEL_TFLITE_MICRO}"],"execution_count":0,"outputs":[]},{"cell_type":"code","metadata":{"id":"iYlIKpO2mkhv","colab_type":"code","colab":{}},"source":[""],"execution_count":0,"outputs":[]}]} \ No newline at end of file From d5a5959dd33d783a2af711b777a51292b0d8a02a Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 13 May 2020 22:30:26 -0700 Subject: [PATCH 171/412] optimize for int8 add. 
PiperOrigin-RevId: 311471171 Change-Id: I822d1205b1c5312ecf0e2602b6ac35082740574d --- .../internal/optimized/integer_ops/add.h | 141 +++++++++++------- 1 file changed, 91 insertions(+), 50 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index a9dae4feac5..8937fe2b26e 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -35,58 +35,99 @@ inline void AddElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); + #ifdef USE_NEON - const int8x8_t output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); - const int8x8_t output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); - for (; i <= size - 8; i += 8) { - const int8x8_t input1_val_original = vld1_s8(input1_data + i); - const int8x8_t input2_val_original = vld1_s8(input2_data + i); - const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); - const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); - const int16x8_t input1_val = - vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val = - vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); - const int16x4_t input1_val_high = vget_high_s16(input1_val); - const int16x4_t input1_val_low = vget_low_s16(input1_val); - const int16x4_t input2_val_high = vget_high_s16(input2_val); - const int16x4_t input2_val_low = vget_low_s16(input2_val); - int32x4_t x11 = vmovl_s16(input1_val_low); - int32x4_t x12 = vmovl_s16(input1_val_high); - int32x4_t x21 = vmovl_s16(input2_val_low); - int32x4_t x22 = vmovl_s16(input2_val_high); - const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); - x11 = vshlq_s32(x11, left_shift_dup); - x12 = vshlq_s32(x12, left_shift_dup); - x21 = vshlq_s32(x21, left_shift_dup); - x22 = vshlq_s32(x22, left_shift_dup); - x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); - x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); - x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); - x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); - const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); - const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); - x11 = vshlq_s32(x11, input1_shift_dup); - x12 = vshlq_s32(x12, input1_shift_dup); - x21 = vshlq_s32(x21, input2_shift_dup); - x22 = vshlq_s32(x22, input2_shift_dup); - int32x4_t s1 = vaddq_s32(x11, x21); - int32x4_t s2 = vaddq_s32(x12, x22); - s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); - s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + const int8x16_t output_activation_min_vector = + vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = + vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + for (; i <= size - 16; i += 16) { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = + vmovl_s8(vget_high_s8(input1_val_original)); + const 
int16x8_t input1_val_s16_low = + vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = + vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = + vaddq_s16(input1_val_s16_high, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_high = + vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); + const int16x8_t input1_val_low = + vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_low = + vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - s1 = RoundingDivideByPOT(s1, -params.output_shift); - s2 = RoundingDivideByPOT(s2, -params.output_shift); - const int16x4_t s1_narrowed = vmovn_s32(s1); - const int16x4_t s2_narrowed = vmovn_s32(s2); - const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), - vdupq_n_s16(params.output_offset)); - const int8x8_t clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(s))); - vst1_s8(output_data + i, clamped); + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, 
-params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), + vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), + vdupq_n_s16(params.output_offset)); + const int16x8_t s = vcombine_s16(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, s)); + vst1q_s8(output_data + i, clamped); } #endif // NEON From 4afee5f519ee47d6771f78c5580c0bc6d17d8876 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Wed, 13 May 2020 22:48:33 -0700 Subject: [PATCH 172/412] Replace SameOperandsAndResultType by TFL_TCresVTEtIsSameAsOp to cover quantization types Also fixes Mobilenet-v3-quant conversion failure. PiperOrigin-RevId: 311473695 Change-Id: I08f836a2b829772f7a8d6b39766ab67ccd2c9a10 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 34 +++++++++++-------- .../mlir/lite/transforms/dense_to_sparse.cc | 3 +- 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index fdf1501dbef..8a949a45e2d 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1561,10 +1561,12 @@ def TFL_GreaterOp : TFL_Op<"greater", [ let printer = [{ return mlir::impl::printOneResultOp(getOperation(), p); }]; } -def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, - SameOperandsAndResultShape, - SameOperandsAndResultType, - TFL_GpuTargetOp]> { +def TFL_HardSwishOp: TFL_Op<"hard_swish", [ + NoSideEffect, + SameOperandsAndResultShape, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + TFL_GpuTargetOp]> { let summary = "Hardswish activation function."; let description = [{ Computes hard-swish activation function @@ -1574,7 +1576,7 @@ def TFL_HardSwishOp: TFL_Op<"hard_swish", [NoSideEffect, let arguments = (ins TFL_TensorOf<[F32, QUI8, QI8]>:$input); - let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$out); + let results = (outs TFL_TensorOf<[F32, QUI8, QI8]>:$output); let hasOptions = 0; } @@ -1606,7 +1608,8 @@ def TFL_L2NormalizationOp : TFL_Op<"l2_normalization", [NoSideEffect, def TFL_LeakyReluOp: TFL_Op<"leaky_relu", [ SameOperandsAndResultShape, NoSideEffect, - SameOperandsAndResultType]> { + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Leaky Relu operator"; let description = [{ @@ -1740,7 +1743,8 @@ def TFL_LogOp: TFL_Op<"log", [ def TFL_LogSoftmaxOp : TFL_Op<"log_softmax", [ NoSideEffect, SameOperandsAndResultShape, - SameOperandsAndResultType, + PredOpTrait<"x and y must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, // zero_point = max_value // scale = -log_softmax_output_min / (max_value + 1) FixedResultScale>, @@ -1896,11 +1900,11 @@ Rounds the values of a tensor to the nearest integer, element-wise. 
}]; let arguments = (ins - TFL_TensorOf<[F32]>:$x + TFL_FpTensor:$x ); let results = (outs - TFL_TensorOf<[F32]>:$y + TFL_FpTensor:$y ); } @@ -2443,9 +2447,9 @@ def TFL_RsqrtOp: TFL_Op<"rsqrt", [NoSideEffect, Computes element-wise reverse square root of input }]; - let arguments = (ins AnyTensor:$x); + let arguments = (ins TFL_FpTensor:$x); - let results = (outs AnyTensor:$y); + let results = (outs TFL_FpTensor:$y); let hasFolder = 1; } @@ -3361,9 +3365,11 @@ def TFL_QuantizeOp: TFL_Op<"quantize", [ let results = (outs AnyTensor:$output); } -def TFL_DensifyOp: TFL_Op<"densify", [NoSideEffect, - SameOperandsAndResultType, - NoQuantizableResult]> { +def TFL_DensifyOp: TFL_Op<"densify", [ + NoSideEffect, + PredOpTrait<"input and output must have same element type", + TFL_TCresVTEtIsSameAsOp<0, 0>>, + NoQuantizableResult]> { let summary = "Densify operator"; let description = [{ diff --git a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc index 201a0bb2481..9b526f40277 100644 --- a/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc +++ b/tensorflow/compiler/mlir/lite/transforms/dense_to_sparse.cc @@ -321,7 +321,8 @@ void DenseToSparse::runOnFunction() { if (result.needs_densify) { const auto value = op->getOperand(operand); - auto densify = builder.create(op->getLoc(), value); + auto densify = + builder.create(op->getLoc(), value.getType(), value); value.replaceAllUsesWith(densify); densify.setOperand(value); } From 9173c6c3d3f9f3a8c58f48f803952cad83bf8730 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 22:52:09 -0700 Subject: [PATCH 173/412] Internal change PiperOrigin-RevId: 311474047 Change-Id: I2c8bcfc0c13d5bf82eaeeadc43202171eeecab8b --- .../kernels/data/experimental/snapshot_util.cc | 18 +++++++----------- tensorflow/core/platform/tensor_coding.cc | 9 +-------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/tensorflow/core/kernels/data/experimental/snapshot_util.cc b/tensorflow/core/kernels/data/experimental/snapshot_util.cc index 3ad1345d776..6c4d6424146 100644 --- a/tensorflow/core/kernels/data/experimental/snapshot_util.cc +++ b/tensorflow/core/kernels/data/experimental/snapshot_util.cc @@ -503,12 +503,10 @@ Status Reader::ReadTensors(std::vector* read_tensors) { size_t tensor_proto_size = tensor_proto_strs[complex_index].second; TensorProto tp; #if defined(PLATFORM_GOOGLE) - auto tensor_proto_ptr = tensor_proto_str.release(); - absl::Cord c; - c.AppendExternalMemory( - absl::string_view(tensor_proto_ptr, tensor_proto_size), - tensor_proto_ptr, - [](void* arg) { delete[] static_cast(arg); }); + absl::string_view tensor_proto_view(tensor_proto_str.get(), + tensor_proto_size); + absl::Cord c = absl::MakeCordFromExternal( + tensor_proto_view, [s = std::move(tensor_proto_str)] {}); if (!tp.ParseFromCord(c)) { return errors::Internal("Could not parse TensorProto"); } @@ -615,11 +613,9 @@ Status Reader::ReadRecord(absl::Cord* record) { } else { auto tmp_str = absl::make_unique(); TF_RETURN_IF_ERROR(input_stream_->ReadNBytes(length, tmp_str.get())); - tstring* tmp_str_raw = tmp_str.release(); - record->AppendExternalMemory(*tmp_str_raw, tmp_str_raw, - [](absl::string_view unused_data, void* arg) { - delete static_cast(arg); - }); + absl::string_view tmp_str_view(*tmp_str); + record->Append( + absl::MakeCordFromExternal(tmp_str_view, [s = std::move(tmp_str)] {})); return Status::OK(); } } diff --git a/tensorflow/core/platform/tensor_coding.cc 
b/tensorflow/core/platform/tensor_coding.cc index 66d28d7b15f..cd938a5be1d 100644 --- a/tensorflow/core/platform/tensor_coding.cc +++ b/tensorflow/core/platform/tensor_coding.cc @@ -134,14 +134,7 @@ std::unique_ptr<StringListDecoder> NewStringListDecoder(const string& in) { #if defined(TENSORFLOW_PROTOBUF_USES_CORD) void AssignRefCounted(StringPiece src, core::RefCounted* obj, absl::Cord* out) { obj->Ref(); - out->Clear(); - // Defines a lambda to unref "obj" when Cord deletes this piece of - // memory. +[] converts the lambda to a C style function pointer. - auto cleanup = +[](absl::string_view donotcare, void* obj) { - reinterpret_cast<core::RefCounted*>(obj)->Unref(); - }; - out->AppendExternalMemory(absl::string_view(src.data(), src.size()), obj, - cleanup); + *out = absl::MakeCordFromExternal(src, [obj] { obj->Unref(); }); } void EncodeStringList(const tstring* strings, int64 n, absl::Cord* out) { From 2f8ea36a4475a726e564223023d98e09959919a9 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 13 May 2020 22:57:23 -0700 Subject: [PATCH 174/412] Pin keras_preprocessing to 1.1.0 PiperOrigin-RevId: 311474544 Change-Id: I1911f5b834e61cd269c39ca30559d1304b3a787f --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index f61e00c01d5..806ad2d0cdb 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -60,7 +60,7 @@ REQUIRED_PACKAGES = [ 'gast == 0.3.3', 'google_pasta >= 0.1.8', 'h5py >= 2.10.0, < 2.11.0', - 'keras_preprocessing >= 1.1.0', + 'keras_preprocessing == 1.1.0', 'numpy >= 1.16.0, < 2.0', 'opt_einsum >= 2.3.2', 'protobuf >= 3.9.2', From 2a55f049241fae552009b8e520894da1404f281f Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Wed, 13 May 2020 23:02:24 -0700 Subject: [PATCH 175/412] Cleanup `setup.py` Remove python 2 and TF 1.x stanzas. Also make keras_preprocessing be between 1.1.1 and 1.2. 
PiperOrigin-RevId: 311475047 Change-Id: I4ba517cb8babd609e83d031c86afb6670d34c757 --- tensorflow/tools/pip_package/setup.py | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 806ad2d0cdb..4b8289a6202 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -55,12 +55,10 @@ _VERSION = '2.2.0' REQUIRED_PACKAGES = [ 'absl-py >= 0.7.0', 'astunparse == 1.6.3', - 'backports.weakref >= 1.0rc1;python_version<"3.4"', - 'enum34 >= 1.1.6;python_version<"3.4"', 'gast == 0.3.3', 'google_pasta >= 0.1.8', 'h5py >= 2.10.0, < 2.11.0', - 'keras_preprocessing == 1.1.0', + 'keras_preprocessing >= 1.1.1, < 1.2', 'numpy >= 1.16.0, < 2.0', 'opt_einsum >= 2.3.2', 'protobuf >= 3.9.2', @@ -68,18 +66,10 @@ REQUIRED_PACKAGES = [ 'tensorflow_estimator >= 2.2.0, < 2.3.0', 'termcolor >= 1.1.0', 'wrapt >= 1.11.1', - # python3 requires wheel 0.26 - 'wheel >= 0.26;python_version>="3"', - 'wheel;python_version<"3"', - # mock comes with unittest.mock for python3, need to install for python2 - 'mock >= 2.0.0;python_version<"3"', - # functools comes with python3, need to install the backport for python2 - 'functools32 >= 3.2.3;python_version<"3"', + 'wheel >= 0.26', 'six >= 1.12.0', # scipy < 1.4.1 causes segfaults due to pybind11 - # Latest scipy pip for py2 is scipy==1.2.2 - 'scipy == 1.4.1;python_version>="3"', - 'scipy == 1.2.2;python_version<"3"', + 'scipy == 1.4.1', ] if sys.byteorder == 'little': @@ -100,8 +90,6 @@ if 'tf_nightly' in project_name: for i, pkg in enumerate(REQUIRED_PACKAGES): if 'tensorboard' in pkg: REQUIRED_PACKAGES[i] = 'tb-nightly >= 2.3.0a0, < 2.4.0a0' - elif 'tensorflow_estimator' in pkg and '2.0' in project_name: - REQUIRED_PACKAGES[i] = 'tensorflow-estimator-2.0-preview' elif 'tensorflow_estimator' in pkg: REQUIRED_PACKAGES[i] = 'tf-estimator-nightly' @@ -121,11 +109,6 @@ CONSOLE_SCRIPTS = [ ] # pylint: enable=line-too-long -# Only keep freeze_graph console script in 1.X. -if _VERSION.startswith('1.') and '_2.0' not in project_name: - CONSOLE_SCRIPTS.append( - 'freeze_graph = tensorflow.python.tools.freeze_graph:run_main') - # remove the tensorboard console script if building tf_nightly if 'tf_nightly' in project_name: CONSOLE_SCRIPTS.remove('tensorboard = tensorboard.main:run_main') From e40aeb534e10db518ec44ccf32b09db8446d6aa3 Mon Sep 17 00:00:00 2001 From: Nat Jeffries Date: Wed, 13 May 2020 23:20:32 -0700 Subject: [PATCH 176/412] Change xtensa optimized softmax to use precomputed lookup table for quantized exponent calculation. Use new memory API for softmax. PiperOrigin-RevId: 311476576 Change-Id: I1026f6eca0e098c42f7b784ab599ed362dc533c9 --- .../micro/kernels/xtensa_hifimini/softmax.cc | 134 ++++++++++++++---- 1 file changed, 108 insertions(+), 26 deletions(-) diff --git a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc index c95fd0e40a4..a7c5604ef64 100644 --- a/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc +++ b/tensorflow/lite/micro/kernels/xtensa_hifimini/softmax.cc @@ -29,16 +29,88 @@ namespace micro { namespace activations { namespace { -// TODO(b/141176180): This code is currently a strict subset of the portable -// implementation (softmax.cc one directory up). When TFLM implements -// registrations for selective types (e.g. compile without float support), this -// can be removed. Otherwise, any HiFi specific optimizations should land here. 
+struct OpData { + uint16_t* exp_lut; +}; + +// Number of unique int8 and int16 values. Used in exponent lookup table +// computation. +constexpr int kInt8Range = + std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min() + 1; +constexpr int kInt16Range = + std::numeric_limits<int16_t>::max() - std::numeric_limits<int16_t>::min() + 1; +// Each 16-bit precalculated exponent is expressed as a Q0.16 fixedpoint +// value. We special-case e^0 since 1.0 requires 1 integer bit to +// express. +constexpr int kExpFractionalBits = 16; +// e^0 expressed as Q1.15 exceeds the int16_t range, so it must be handled +// specially. +constexpr int kMaxExponentValue = (1 << kExpFractionalBits); + +// Quantized softmax with int8 input and int16 output. +// TODO(b/155656675): Investigate removing const ref params. +inline TfLiteStatus Softmax(const OpData& op_data, + const RuntimeShape& input_shape, + const int8_t* input_data, + const RuntimeShape& output_shape, + int16_t* output_data) { + // The last dimension is depth. Outer size is the total input size + // divided by depth. + const int trailing_dim = input_shape.DimensionsCount() - 1; + const int outer_size = + MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape); + const int depth = + MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim); + + for (int i = 0; i < outer_size; ++i) { + int8_t max_in_row = std::numeric_limits<int8_t>::min(); + for (int c = 0; c < depth; ++c) { + max_in_row = std::max(max_in_row, input_data[i * depth + c]); + } + + uint32_t sum_of_exps = 0; + for (int c = 0; c < depth; ++c) { + TFLITE_DCHECK(max_in_row >= input_data[i * depth + c]); + uint8_t input_diff = max_in_row - input_data[i * depth + c]; + + sum_of_exps += + input_diff == 0 ? kMaxExponentValue : op_data.exp_lut[input_diff]; + } + + // Ensure we cannot overflow the full_range_output value. We need to + // guarantee that kInt16Range * max(input_data) / sum_of_exps < kInt16Range. + TFLITE_DCHECK(sum_of_exps >= kMaxExponentValue); + + for (int c = 0; c < depth; ++c) { + uint8_t input_diff = max_in_row - input_data[i * depth + c]; + // Special case for diff == 0 + uint32_t unscaled_output = + input_diff == 0 ? kMaxExponentValue : op_data.exp_lut[input_diff]; + int64_t scaled_output = static_cast<int64_t>(unscaled_output) * + static_cast<int64_t>(kInt16Range); + int32_t full_range_output = + scaled_output / sum_of_exps + std::numeric_limits<int16_t>::min(); + // Round up if remainder exceeds half of the divider value. + uint32_t remainder = scaled_output % sum_of_exps; + if (remainder * 2 >= sum_of_exps) { + full_range_output++; + } + output_data[i * depth + c] = static_cast<int16_t>(std::max( + std::min(full_range_output, + static_cast<int32_t>(std::numeric_limits<int16_t>::max())), + static_cast<int32_t>(std::numeric_limits<int16_t>::min()))); + } + } + return kTfLiteOk; +} + +} // namespace TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, const TfLiteTensor* input, TfLiteTensor* output, const TfLiteSoftmaxParams* params, - SoftmaxParams* op_data) { + OpData* op_data) { if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8) { if (input->type == kTfLiteUInt8) { TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); @@ -55,28 +127,30 @@ TfLiteStatus CalculateSoftmaxOpData(TfLiteContext* context, } } - static const int kScaledDiffIntegerBits = 5; + // Precompute e^(-x * input_scale * beta) for every possible int8 input. + // This computation is used for every iteration of Softmax. We must compute + // using pre-scaled inputs to avoid introducing additional error, while + // restricting our input range to the int8 range. This is valid since beta + // and input scale are constant for a given op in the graph. Skip index 0 + // since that is a special case which requires 1 integer bit instead of 0. + for (int i = 1; i <= kInt8Range; i++) { + float scaled_input = i * input->params.scale; + float exp_value = + std::exp((-scaled_input) * static_cast<float>(params->beta)); - int input_left_shift; - tflite::PreprocessSoftmaxScaling( - static_cast<double>(params->beta), - static_cast<double>(input->params.scale), kScaledDiffIntegerBits, - &op_data->input_multiplier, &input_left_shift); - op_data->input_left_shift = input_left_shift; - op_data->diff_min = - -1.0 * tflite::CalculateInputRadius(kScaledDiffIntegerBits, - op_data->input_left_shift); + float exponent_scaled = + std::round(exp_value * static_cast<float>(1 << kExpFractionalBits)); + op_data->exp_lut[i] = static_cast<uint16_t>(exponent_scaled); + } } return kTfLiteOk; } -} // namespace - void* SoftmaxInit(TfLiteContext* context, const char* buffer, size_t length) { TFLITE_DCHECK(context->AllocatePersistentBuffer != nullptr); void* data = nullptr; - if (context->AllocatePersistentBuffer(context, sizeof(SoftmaxParams), - &data) == kTfLiteError) { + if (context->AllocatePersistentBuffer(context, sizeof(OpData), &data) == + kTfLiteError) { return nullptr; } return data; @@ -92,26 +166,34 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE(context, NumDimensions(input) >= 1); TFLITE_DCHECK(node->user_data != nullptr); - SoftmaxParams* op_params = static_cast<SoftmaxParams*>(node->user_data); + OpData* op_data = static_cast<OpData*>(node->user_data); + + // Allocate an array to precompute exponents over all int8 inputs, applying + // the scale and beta before calculating exp. It is mandatory to apply beta + // and scale here, since each softmax op may have different beta and scale + // values. Beta and scale will remain constant for a given softmax op. + void* allocated_ptr; + TF_LITE_ENSURE_STATUS(context->AllocatePersistentBuffer( + context, kInt8Range * sizeof(int16_t), &allocated_ptr)); + op_data->exp_lut = static_cast<uint16_t*>(allocated_ptr); TF_LITE_ENSURE_STATUS( - CalculateSoftmaxOpData(context, input, output, params, op_params)); + CalculateSoftmaxOpData(context, input, output, params, op_data)); return kTfLiteOk; } TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) { - auto* op_params = static_cast<SoftmaxParams*>(node->user_data); + auto* op_data = static_cast<OpData*>(node->user_data); const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); if (input->type == kTfLiteInt8 && output->type == kTfLiteInt16) { // TODO(b/155656675): Const ref params can be slow on xtensa. - tflite::reference_ops::Softmax( - *op_params, GetTensorShape(input), GetTensorData<int8_t>(input), - GetTensorShape(output), GetTensorData<int16_t>(output)); - return kTfLiteOk; + return Softmax(*op_data, GetTensorShape(input), + GetTensorData<int8_t>(input), GetTensorShape(output), + GetTensorData<int16_t>(output)); } else { TF_LITE_KERNEL_LOG(context, "Type %s (%d) not supported.", TfLiteTypeGetName(input->type), input->type); From 112288586dd69d5eede04be059b7eddc5635bc98 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 23:31:44 -0700 Subject: [PATCH 177/412] Switch weights from per-value to per-input-item. 
PiperOrigin-RevId: 311477582 Change-Id: I749c4edfcfd4dd3acd036a1d14b2c493b8d8bfc8 --- .../api_def_DenseCountSparseOutput.pbtxt | 23 +- .../api_def_RaggedCountSparseOutput.pbtxt | 27 +- .../api_def_SparseCountSparseOutput.pbtxt | 29 +-- tensorflow/core/kernels/count_ops.cc | 246 +++++++++++------- tensorflow/core/ops/count_ops.cc | 39 ++- tensorflow/python/ops/bincount.py | 151 ++--------- tensorflow/python/ops/bincount_test.py | 188 ++++--------- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v1/tensorflow.sparse.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v2/tensorflow.sparse.pbtxt | 2 +- 11 files changed, 278 insertions(+), 441 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt index 8296bfe6d7b..416da1ccaab 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt @@ -4,62 +4,61 @@ op { in_arg { name: "values" description: < -using BatchedMap = std::vector>; +using BatchedIntMap = std::vector>; namespace { // TODO(momernick): Extend this function to work with outputs of rank > 2. -template -Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, +Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, bool is_1d, OpKernelContext* context) { int total_values = 0; int num_batches = per_batch_counts.size(); @@ -47,12 +44,12 @@ Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, context->allocate_output(1, TensorShape({total_values}), &values)); auto output_indices = indices->matrix(); - auto output_values = values->flat(); + auto output_values = values->flat(); int64 value_loc = 0; for (int b = 0; b < num_batches; ++b) { const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); std::sort(pairs.begin(), pairs.end()); for (const auto& x : pairs) { if (is_1d) { @@ -80,19 +77,85 @@ Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, return Status::OK(); } -int GetOutputSize(int max_seen, int max_length, int min_length) { +Status OutputWeightedSparse(const BatchedIntMap& per_batch_counts, + int num_values, const Tensor& weights, bool is_1d, + OpKernelContext* context) { + if (!TensorShapeUtils::IsVector(weights.shape())) { + return errors::InvalidArgument( + "Weights must be a 1-dimensional tensor. Got: ", + weights.shape().DebugString()); + } + + if (num_values > weights.dim_size(0)) { + return errors::InvalidArgument("The maximum array value was ", num_values, + ", but the weight array has size ", + weights.shape().DebugString()); + } + auto weight_values = weights.flat(); + + int total_values = 0; + int num_batches = per_batch_counts.size(); + for (const auto& per_batch_count : per_batch_counts) { + total_values += per_batch_count.size(); + } + + Tensor* indices; + int inner_dim = is_1d ? 
1 : 2; + TF_RETURN_IF_ERROR(context->allocate_output( + 0, TensorShape({total_values, inner_dim}), &indices)); + + Tensor* values; + TF_RETURN_IF_ERROR( + context->allocate_output(1, TensorShape({total_values}), &values)); + + auto output_indices = indices->matrix(); + auto output_values = values->flat(); + int64 value_loc = 0; + for (int b = 0; b < num_batches; ++b) { + const auto& per_batch_count = per_batch_counts[b]; + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); + std::sort(pairs.begin(), pairs.end()); + for (const auto& x : pairs) { + if (is_1d) { + output_indices(value_loc, 0) = x.first; + } else { + output_indices(value_loc, 0) = b; + output_indices(value_loc, 1) = x.first; + } + output_values(value_loc) = x.second * weight_values(x.first); + ++value_loc; + } + } + + Tensor* dense_shape; + if (is_1d) { + TF_RETURN_IF_ERROR( + context->allocate_output(2, TensorShape({1}), &dense_shape)); + dense_shape->flat().data()[0] = num_values; + } else { + TF_RETURN_IF_ERROR( + context->allocate_output(2, TensorShape({2}), &dense_shape)); + dense_shape->flat().data()[0] = num_batches; + dense_shape->flat().data()[1] = num_values; + } + return Status::OK(); +} + +template +T GetOutputSize(T max_seen, T max_length, T min_length) { return max_length > 0 ? max_length : std::max((max_seen + 1), min_length); } } // namespace -template +template class DenseCount : public OpKernel { public: explicit DenseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); } void Compute(OpKernelContext* context) override { @@ -107,15 +170,6 @@ class DenseCount : public OpKernel { "Input must be a 1 or 2-dimensional tensor. Got: ", data.shape().DebugString())); - if (use_weights) { - OP_REQUIRES( - context, weights.shape() == data.shape(), - errors::InvalidArgument( - "Weights and data must have the same shape. 
Weight shape: ", - weights.shape().DebugString(), - "; data shape: ", data.shape().DebugString())); - } - bool is_1d = TensorShapeUtils::IsVector(data.shape()); int negative_valued_axis = -1; int num_batch_dimensions = (data.shape().dims() + negative_valued_axis); @@ -125,23 +179,19 @@ class DenseCount : public OpKernel { num_batch_elements *= data.shape().dim_size(i); } int num_value_elements = data.shape().num_elements() / num_batch_elements; - auto per_batch_counts = BatchedMap(num_batch_elements); - + auto per_batch_counts = BatchedIntMap(num_batch_elements); T max_value = 0; const auto data_values = data.flat(); - const auto weight_values = weights.flat(); int i = 0; for (int b = 0; b < num_batch_elements; ++b) { for (int v = 0; v < num_value_elements; ++v) { const auto& value = data_values(i); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_output_) { - per_batch_counts[b][value] = 1; - } else if (use_weights) { - per_batch_counts[b][value] += weight_values(i); + if (binary_count_) { + (per_batch_counts[b])[value] = 1; } else { - per_batch_counts[b][value]++; + (per_batch_counts[b])[value]++; } if (value > max_value) { max_value = value; @@ -151,24 +201,30 @@ class DenseCount : public OpKernel { } } - int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); + T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + if (use_weights) { + OP_REQUIRES_OK(context, + OutputWeightedSparse(per_batch_counts, num_output_values, + weights, is_1d, context)); + } else { + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); + } } private: - int maxlength_; - int minlength_; - bool binary_output_; + T minlength_; + T maxlength_; + bool binary_count_; }; -template +template class SparseCount : public OpKernel { public: explicit SparseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); } void Compute(OpKernelContext* context) override { @@ -179,27 +235,23 @@ class SparseCount : public OpKernel { bool use_weights = weights.NumElements() > 0; bool is_1d = shape.NumElements() == 1; + const auto indices_values = indices.matrix(); + const auto values_values = values.flat(); + int num_batches = is_1d ? 1 : shape.flat()(0); int num_values = values.NumElements(); - const auto indices_values = indices.matrix(); - const auto values_values = values.flat(); - const auto weight_values = weights.flat(); - - auto per_batch_counts = BatchedMap(num_batches); - + auto per_batch_counts = BatchedIntMap(num_batches); T max_value = 0; for (int idx = 0; idx < num_values; ++idx) { int batch = is_1d ? 
0 : indices_values(idx, 0); const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_output_) { - per_batch_counts[batch][value] = 1; - } else if (use_weights) { - per_batch_counts[batch][value] += weight_values(idx); + if (binary_count_) { + (per_batch_counts[batch])[value] = 1; } else { - per_batch_counts[batch][value]++; + (per_batch_counts[batch])[value]++; } if (value > max_value) { max_value = value; @@ -207,25 +259,30 @@ class SparseCount : public OpKernel { } } - int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); + T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + if (use_weights) { + OP_REQUIRES_OK(context, + OutputWeightedSparse(per_batch_counts, num_output_values, + weights, is_1d, context)); + } else { + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); + } } private: - int maxlength_; - int minlength_; - bool binary_output_; - bool validate_; + T minlength_; + T maxlength_; + bool binary_count_; }; -template +template class RaggedCount : public OpKernel { public: explicit RaggedCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); } void Compute(OpKernelContext* context) override { @@ -233,15 +290,13 @@ class RaggedCount : public OpKernel { const Tensor& values = context->input(1); const Tensor& weights = context->input(2); bool use_weights = weights.NumElements() > 0; - bool is_1d = false; const auto splits_values = splits.flat(); const auto values_values = values.flat(); - const auto weight_values = weights.flat(); int num_batches = splits.NumElements() - 1; int num_values = values.NumElements(); - auto per_batch_counts = BatchedMap(num_batches); + auto per_batch_counts = BatchedIntMap(num_batches); T max_value = 0; int batch_idx = 0; @@ -251,12 +306,10 @@ class RaggedCount : public OpKernel { } const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_output_) { - per_batch_counts[batch_idx - 1][value] = 1; - } else if (use_weights) { - per_batch_counts[batch_idx - 1][value] += weight_values(idx); + if (binary_count_) { + (per_batch_counts[batch_idx - 1])[value] = 1; } else { - per_batch_counts[batch_idx - 1][value]++; + (per_batch_counts[batch_idx - 1])[value]++; } if (value > max_value) { max_value = value; @@ -264,47 +317,42 @@ class RaggedCount : public OpKernel { } } - int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); + T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + if (use_weights) { + OP_REQUIRES_OK(context, + OutputWeightedSparse(per_batch_counts, num_output_values, + weights, false, context)); + } else { + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + false, context)); + } } private: - int maxlength_; - int minlength_; - bool binary_output_; - bool validate_; + T minlength_; + T maxlength_; + bool binary_count_; }; -#define REGISTER_W(W_TYPE) \ - REGISTER(int32, W_TYPE) \ - REGISTER(int64, W_TYPE) 
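Because the hunks above interleave the binary, weighted, and unweighted counting paths, here is a minimal standalone sketch of the per-value weighting semantics that OutputWeightedSparse implements (and that the bincount tests further down exercise): each distinct non-negative value is counted, and the count is then scaled by the weight stored at that value's index, not at the input element's position. The sketch assumes the weight vector is long enough to index every counted value, which the kernel enforces with an InvalidArgument error; batching, minlength/maxlength, and the sparse-output packaging are omitted, and WeightedCount is an illustrative name, not TensorFlow API.

#include <cstdio>
#include <map>
#include <vector>

// Count each non-negative value, optionally binarizing the count, then scale
// each count by weights[value] (weights are indexed by the counted value).
std::map<int, float> WeightedCount(const std::vector<int>& values,
                                   const std::vector<float>& weights,
                                   bool binary_count) {
  std::map<int, int> counts;
  for (int v : values) {
    if (v < 0) continue;
    if (binary_count) {
      counts[v] = 1;
    } else {
      ++counts[v];
    }
  }
  std::map<int, float> weighted;
  for (const auto& kv : counts) {
    weighted[kv.first] = kv.second * weights[kv.first];
  }
  return weighted;
}

int main() {
  // Value 1 appears twice, so its output is 2 * weights[1] = 1.0.
  const auto out = WeightedCount({3, 2, 1, 1}, {0.f, 0.5f, 2.f, 10.f}, false);
  for (const auto& kv : out) {
    std::printf("%d -> %g\n", kv.first, kv.second);  // 1 -> 1, 2 -> 2, 3 -> 10
  }
  return 0;
}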
+#define REGISTER(TYPE) \ + \ + REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ + .TypeConstraint("T") \ + .Device(DEVICE_CPU), \ + DenseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ + .TypeConstraint("T") \ + .Device(DEVICE_CPU), \ + SparseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ + .TypeConstraint("T") \ + .Device(DEVICE_CPU), \ + RaggedCount) -#define REGISTER(I_TYPE, W_TYPE) \ - \ - REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ - .TypeConstraint("T") \ - .TypeConstraint("output_type") \ - .Device(DEVICE_CPU), \ - DenseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ - .TypeConstraint("T") \ - .TypeConstraint("output_type") \ - .Device(DEVICE_CPU), \ - SparseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ - .TypeConstraint("T") \ - .TypeConstraint("output_type") \ - .Device(DEVICE_CPU), \ - RaggedCount) - -TF_CALL_INTEGRAL_TYPES(REGISTER_W); -TF_CALL_float(REGISTER_W); -TF_CALL_double(REGISTER_W); - -#undef REGISTER_W +REGISTER(int32); +REGISTER(int64); #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/ops/count_ops.cc b/tensorflow/core/ops/count_ops.cc index 8de0a2ef954..c9fbe1f8d8e 100644 --- a/tensorflow/core/ops/count_ops.cc +++ b/tensorflow/core/ops/count_ops.cc @@ -19,21 +19,12 @@ limitations under the License. namespace tensorflow { +using shape_inference::DimensionHandle; using shape_inference::InferenceContext; -using shape_inference::ShapeHandle; Status DenseCountSparseOutputShapeFn(InferenceContext *c) { - auto values = c->input(0); - auto weights = c->input(1); - ShapeHandle output; - auto num_weights = c->NumElements(weights); - if (c->ValueKnown(num_weights) && c->Value(num_weights) == 0) { - output = values; - } else { - TF_RETURN_IF_ERROR(c->Merge(weights, values, &output)); - } - auto rank = c->Rank(output); - auto nvals = c->UnknownDim(); + int32 rank = c->Rank(c->input(0)); + DimensionHandle nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -41,8 +32,8 @@ Status DenseCountSparseOutputShapeFn(InferenceContext *c) { } Status SparseCountSparseOutputShapeFn(InferenceContext *c) { - auto rank = c->Dim(c->input(0), 1); - auto nvals = c->UnknownDim(); + DimensionHandle rank = c->Dim(c->input(0), 1); + DimensionHandle nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -54,7 +45,7 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { if (rank != c->kUnknownRank) { ++rank; // Add the ragged dimension } - auto nvals = c->UnknownDim(); + DimensionHandle nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -63,12 +54,12 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { REGISTER_OP("DenseCountSparseOutput") .Input("values: T") - .Input("weights: output_type") + .Input("weights: float") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_output: bool") - .Attr("output_type: {int32, int64, float, double}") + .Attr("binary_count: bool") + .Attr("output_type: {int64, float}") .SetShapeFn(DenseCountSparseOutputShapeFn) .Output("output_indices: int64") 
.Output("output_values: output_type") @@ -78,12 +69,12 @@ REGISTER_OP("SparseCountSparseOutput") .Input("indices: int64") .Input("values: T") .Input("dense_shape: int64") - .Input("weights: output_type") + .Input("weights: float") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_output: bool") - .Attr("output_type: {int32, int64, float, double}") + .Attr("binary_count: bool") + .Attr("output_type: {int64, float}") .SetShapeFn(SparseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -92,12 +83,12 @@ REGISTER_OP("SparseCountSparseOutput") REGISTER_OP("RaggedCountSparseOutput") .Input("splits: int64") .Input("values: T") - .Input("weights: output_type") + .Input("weights: float") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_output: bool") - .Attr("output_type: {int32, int64, float, double}") + .Attr("binary_count: bool") + .Attr("output_type: {int64, float}") .SetShapeFn(RaggedCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") diff --git a/tensorflow/python/ops/bincount.py b/tensorflow/python/ops/bincount.py index 68950eaf596..e1b3bebaaaa 100644 --- a/tensorflow/python/ops/bincount.py +++ b/tensorflow/python/ops/bincount.py @@ -18,10 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops -from tensorflow.python.ops import check_ops from tensorflow.python.ops import gen_count_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util.tf_export import tf_export @@ -33,7 +33,7 @@ def sparse_bincount(values, axis=0, minlength=None, maxlength=None, - binary_output=False, + binary_count=False, name=None): """Count the number of times an integer value appears in a tensor. @@ -58,9 +58,8 @@ def sparse_bincount(values, maxlength: If given, skips `values` that are greater than or equal to `maxlength`, and ensures that the output has a `dense_shape` of at most `maxlength` in the inner dimension. - binary_output: If True, this op will output 1 instead of the number of times - a token appears (equivalent to one_hot + reduce_any instead of one_hot + - reduce_add). Defaults to False. + binary_count: Whether to do a binary count. When True, this op will return 1 + for any value that exists instead of counting the number of occurrences. name: A name for this op. Returns: @@ -79,7 +78,7 @@ def sparse_bincount(values, SparseTensor) and returns a SparseTensor where the value of (i,j) is the number of times value j appears in batch i. - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] >>> output = tf.sparse.bincount(data, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( @@ -103,7 +102,7 @@ def sparse_bincount(values, dense shape is [2, 500] instead of [2,10002] or [2, 102]. >>> minlength = maxlength = 500 - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] >>> output = tf.sparse.bincount( ... 
data, axis=-1, minlength=minlength, maxlength=maxlength) >>> print(output) @@ -124,8 +123,8 @@ def sparse_bincount(values, some values (like 20 in batch 1 and 11 in batch 2) appear more than once, the 'values' tensor is all 1s. - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) - >>> output = tf.sparse.bincount(data, binary_output=True, axis=-1) + >>> dense = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> output = tf.sparse.bincount(dense, binary_count=True, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( [[ 0 10] @@ -137,42 +136,20 @@ def sparse_bincount(values, values=tf.Tensor([1 1 1 1 1 1], shape=(6,), dtype=int64), dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) - **Weighted bin-counting** - - This example takes two inputs - a values tensor and a weights tensor. These - tensors must be identically shaped, and have the same row splits or indices - in the case of RaggedTensors or SparseTensors. When performing a weighted - count, the op will output a SparseTensor where the value of (i, j) is the - sum of the values in the weight tensor's batch i in the locations where - the values tensor has the value j. In this case, the output dtype is the - same as the dtype of the weights tensor. - - >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) - >>> weights = [[2, 0.25, 15, 0.5], [2, 17, 3, 0.9]] - >>> output = tf.sparse.bincount(data, weights=weights, axis=-1) - >>> print(output) - SparseTensor(indices=tf.Tensor( - [[ 0 10] - [ 0 20] - [ 0 30] - [ 1 11] - [ 1 101] - [ 1 10001]], shape=(6, 2), dtype=int64), - values=tf.Tensor([2. 0.75 15. 5. 17. 0.9], shape=(6,), dtype=float32), - dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) - """ with ops.name_scope(name, "count", [values, weights]): if not isinstance(values, sparse_tensor.SparseTensor): values = ragged_tensor.convert_to_tensor_or_ragged_tensor( values, name="values") - if weights is not None: - if not isinstance(weights, sparse_tensor.SparseTensor): - weights = ragged_tensor.convert_to_tensor_or_ragged_tensor( - weights, name="weights") - if weights is not None and binary_output: - raise ValueError("binary_output and weights are mutually exclusive.") + if weights is not None and binary_count: + raise ValueError("binary_count and weights are mutually exclusive.") + + if weights is None: + weights = [] + output_type = dtypes.int64 + else: + output_type = dtypes.float32 if axis is None: axis = 0 @@ -185,114 +162,38 @@ def sparse_bincount(values, maxlength_value = maxlength if maxlength is not None else -1 if axis == 0: - if isinstance(values, sparse_tensor.SparseTensor): - if weights is not None: - weights = validate_sparse_weights(values, weights) - values = values.values - elif isinstance(values, ragged_tensor.RaggedTensor): - if weights is not None: - weights = validate_ragged_weights(values, weights) + if isinstance(values, + (sparse_tensor.SparseTensor, ragged_tensor.RaggedTensor)): values = values.values else: - if weights is not None: - weights = array_ops.reshape(weights, [-1]) values = array_ops.reshape(values, [-1]) if isinstance(values, sparse_tensor.SparseTensor): - weights = validate_sparse_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.sparse_count_sparse_output( values.indices, values.values, values.dense_shape, - weights, + weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_output=binary_output) + binary_count=binary_count, + output_type=output_type) elif isinstance(values, 
ragged_tensor.RaggedTensor): - weights = validate_ragged_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.ragged_count_sparse_output( values.row_splits, values.values, - weights, + weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_output=binary_output) + binary_count=binary_count, + output_type=output_type) else: - weights = validate_dense_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.dense_count_sparse_output( values, weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_output=binary_output) + binary_count=binary_count, + output_type=output_type) return sparse_tensor.SparseTensor(c_ind, c_val, c_shape) - - -def validate_dense_weights(values, weights): - """Validates the passed weight tensor or creates an empty one.""" - if weights is None: - return array_ops.constant([], dtype=values.dtype) - - if not isinstance(weights, ops.Tensor): - raise ValueError( - "`weights` must be a tf.Tensor if `values` is a tf.Tensor.") - - return weights - - -def validate_sparse_weights(values, weights): - """Validates the passed weight tensor or creates an empty one.""" - if weights is None: - return array_ops.constant([], dtype=values.values.dtype) - - if not isinstance(weights, sparse_tensor.SparseTensor): - raise ValueError( - "`weights` must be a SparseTensor if `values` is a SparseTensor.") - - checks = [] - if weights.dense_shape is not values.dense_shape: - checks.append( - check_ops.assert_equal( - weights.dense_shape, - values.dense_shape, - message="'weights' and 'values' must have the same dense shape.")) - if weights.indices is not values.indices: - checks.append( - check_ops.assert_equal( - weights.indices, - values.indices, - message="'weights' and 'values' must have the same indices.") - ) - if checks: - with ops.control_dependencies(checks): - weights = array_ops.identity(weights.values) - else: - weights = weights.values - - return weights - - -def validate_ragged_weights(values, weights): - """Validates the passed weight tensor or creates an empty one.""" - if weights is None: - return array_ops.constant([], dtype=values.values.dtype) - - if not isinstance(weights, ragged_tensor.RaggedTensor): - raise ValueError( - "`weights` must be a RaggedTensor if `values` is a RaggedTensor.") - - checks = [] - if weights.row_splits is not values.row_splits: - checks.append( - check_ops.assert_equal( - weights.row_splits, - values.row_splits, - message="'weights' and 'values' must have the same row splits.")) - if checks: - with ops.control_dependencies(checks): - weights = array_ops.identity(weights.values) - else: - weights = weights.values - - return weights diff --git a/tensorflow/python/ops/bincount_test.py b/tensorflow/python/ops/bincount_test.py index 839af8dcc35..776b65b72d0 100644 --- a/tensorflow/python/ops/bincount_test.py +++ b/tensorflow/python/ops/bincount_test.py @@ -21,8 +21,6 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np -from tensorflow.python.eager import context -from tensorflow.python.framework import errors from tensorflow.python.ops import bincount from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_factory_ops @@ -67,7 +65,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 6], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_maxlength_binary", 
"x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -75,7 +73,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 7], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -84,7 +82,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 9], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -93,40 +91,40 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 8], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_no_maxlength_weights", "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], - "expected_values": [2, 1, 0.5, 9, 3], + "expected_values": [1, 2, 3, 8, 5], "expected_shape": [2, 6], - "weights": [[0.5, 1, 2], [3, 4, 5]] + "weights": [0.5, 1, 2, 3, 4, 5] }, { "testcase_name": "_maxlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "maxlength": 7, "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], - "expected_values": [2, 1, 0.5, 3, 9], + "expected_values": [1, 2, 3, 0.5, 8], "expected_shape": [2, 7], - "weights": [[0.5, 1, 2, 11], [7, 3, 4, 5]] + "weights": [0.5, 1, 2, 3, 4, 5, 6] }, { "testcase_name": "_minlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 9, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [2, 1, 0.5, 3, 5, 13, 4], + "expected_values": [1, 2, 3, 7, 0.5, 8, 7], "expected_shape": [2, 9], - "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_minlength_larger_values_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 3, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [2, 1, 0.5, 3, 5, 13, 4], + "expected_values": [1, 2, 3, 7, 0.5, 8, 7], "expected_shape": [2, 8], - "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_1d", "x": np.array([3, 2, 1, 1], dtype=np.int32), @@ -148,7 +146,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, minlength=None, maxlength=None, - binary_output=False, + binary_count=False, weights=None, axis=-1): y = bincount.sparse_bincount( @@ -156,7 +154,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): weights=weights, minlength=minlength, maxlength=maxlength, - binary_output=binary_output, + binary_count=binary_count, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -218,7 +216,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], "expected_values": [1, 1, 1, 1], "expected_shape": [3, 6], - "binary_output": + "binary_count": True, }, { @@ -232,7 +230,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 7], "maxlength": 7, - "binary_output": + "binary_count": True, }, { @@ -246,7 +244,7 @@ class 
TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 9], "minlength": 9, - "binary_output": + "binary_count": True, }, { @@ -260,7 +258,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 8], "minlength": 3, - "binary_output": + "binary_count": True, }, { @@ -270,10 +268,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [2, 6, 7, 10], + "expected_values": [1, 3, 8, 5], "expected_shape": [3, 6], - "weights": - np.array([[6, 0, 2, 0], [0, 0, 0, 0], [10, 0, 3.5, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5] }, { "testcase_name": @@ -282,12 +279,11 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 7, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [2, 6, 7, 10], + "expected_values": [1, 3, 8, 5], "expected_shape": [3, 7], "maxlength": 7, - "weights": - np.array([[6, 0, 2, 0], [0, 0, 14, 0], [10, 0, 3.5, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5, 6] }, { "testcase_name": @@ -296,12 +292,11 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [2, 6, 14, 6.5, 10], + "expected_values": [1, 3, 7, 8, 5], "expected_shape": [3, 9], "minlength": 9, - "weights": - np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": @@ -310,12 +305,11 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [2, 6, 14, 6.5, 10], + "expected_values": [1, 3, 7, 8, 5], "expected_shape": [3, 8], "minlength": 3, - "weights": - np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_1d", @@ -344,17 +338,16 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_output=False, + binary_count=False, weights=None, axis=-1): x_sparse = sparse_ops.from_dense(x) - w_sparse = sparse_ops.from_dense(weights) if weights is not None else None y = bincount.sparse_bincount( x_sparse, - weights=w_sparse, + weights=weights, minlength=minlength, maxlength=maxlength, - binary_output=binary_output, + binary_count=binary_count, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -400,7 +393,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 6], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_maxlength_binary", @@ -409,7 +402,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 7], - "binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_binary", @@ -419,13 +412,13 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [5, 9], - 
"binary_output": True, + "binary_count": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "minlength": 3, - "binary_output": True, + "binary_count": True, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], @@ -435,18 +428,18 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "testcase_name": "_no_maxlength_weights", "x": [[], [], [3, 0, 1], [], [5, 0, 4, 4]], "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 0.5, 8, 5], "expected_shape": [5, 6], - "weights": [[], [], [6, 0.5, 2], [], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5] }, { "testcase_name": "_maxlength_weights", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "maxlength": 7, "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 0.5, 8, 5], "expected_shape": [5, 7], - "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5, 6] }, { "testcase_name": "_minlength_weights", @@ -454,9 +447,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 9, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], "expected_shape": [5, 9], - "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_minlength_larger_values_weights", @@ -464,9 +457,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 3, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], + "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], "expected_shape": [5, 8], - "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], + "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] }, { "testcase_name": "_1d", @@ -491,114 +484,21 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_output=False, + binary_count=False, weights=None, axis=-1): x_ragged = ragged_factory_ops.constant(x) - w = ragged_factory_ops.constant(weights) if weights is not None else None y = bincount.sparse_bincount( x_ragged, - weights=w, + weights=weights, minlength=minlength, maxlength=maxlength, - binary_output=binary_output, + binary_count=binary_count, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) self.assertAllEqual(expected_shape, y.dense_shape) -class TestSparseCountFailureModes(test.TestCase): - - def test_dense_input_sparse_weights_fails(self): - x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - weights = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_dense_input_ragged_weights_fails(self): - x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) - with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_dense_input_wrong_shape_fails(self): - x = 
np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - weights = np.array([[3, 2], [5, 4], [4, 3]]) - # Note: Eager mode and graph mode throw different errors here. Graph mode - # will fail with a ValueError from the shape checking logic, while Eager - # will fail with an InvalidArgumentError from the kernel itself. - if context.executing_eagerly(): - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same shape"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - else: - with self.assertRaisesRegexp(ValueError, "both shapes must be equal"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_dense_weights_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_ragged_weights_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) - with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_wrong_indices_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = sparse_ops.from_dense( - np.array([[3, 1, 0, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same indices"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_too_many_indices_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = sparse_ops.from_dense( - np.array([[3, 1, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "Incompatible shapes"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_sparse_input_wrong_shape_fails(self): - x = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - weights = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4], [0, 0, 0, 0]], - dtype=np.int32)) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same dense shape"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_ragged_input_dense_weights_fails(self): - x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) - weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) - with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_ragged_input_sparse_weights_fails(self): - x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) - weights = sparse_ops.from_dense( - np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) - with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - def test_ragged_input_different_shape_fails(self): - x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) - weights = ragged_factory_ops.constant([[6, 0.5, 2], 
[], [10, 0.25, 5, 3]]) - with self.assertRaisesRegexp(errors.InvalidArgumentError, - "must have the same row splits"): - self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) - - if __name__ == "__main__": test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 44fb74ac63a..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt index f8f8edb26a8..4c4f6c62291 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 44fb74ac63a..05b8842be66 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, 
defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt index 67235bb2cf2..a9ad81920dd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt @@ -10,7 +10,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" From 03f3e8153c405578fdd6aea6694569859eecaac9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 13 May 2020 23:46:17 -0700 Subject: [PATCH 178/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311478670 Change-Id: Ib8c15d5cba307629a0d8fc55e07efc401502899e --- tensorflow/go/op/wrappers.go | 124 ++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 59 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index e6725269279..c6d67c9ad44 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -4715,7 +4715,7 @@ type DenseCountSparseOutputAttr func(optionalAttr) // DenseCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: Minimum value to count. Can be set to -1 for no minimum. +// value: int32; minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4727,7 +4727,7 @@ func DenseCountSparseOutputMinlength(value int64) DenseCountSparseOutputAttr { // DenseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: Maximum value to count. Can be set to -1 for no maximum. +// value: int32; maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4742,20 +4742,20 @@ func DenseCountSparseOutputMaxlength(value int64) DenseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// values: Tensor containing data to count. 
-// weights: A Tensor of the same shape as indices containing per-index weight values. May -// also be the empty tensor if no weights are used. -// binary_output: Whether to output the number of occurrences of each value or 1. +// values: int32 or int64; Tensor containing data to count. +// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. +// binary_count: bool; whether to output the number of occurrences of each value or 1. +// output_type: dtype; dtype of the output values tensor. // // Returns: -// output_indices: Indices tensor for the resulting sparse tensor object. -// output_values: Values tensor for the resulting sparse tensor object. -// output_dense_shape: Shape tensor for the resulting sparse tensor object. -func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_output bool, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: int64; indices tensor for the resulting sparse tensor object. +// output_values: int64 or float32; values tensor for the resulting sparse tensor object. +// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. +func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_output": binary_output} + attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} for _, a := range optional { a(attrs) } @@ -8607,7 +8607,7 @@ type RaggedCountSparseOutputAttr func(optionalAttr) // RaggedCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: Minimum value to count. Can be set to -1 for no minimum. +// value: int32; minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8619,7 +8619,7 @@ func RaggedCountSparseOutputMinlength(value int64) RaggedCountSparseOutputAttr { // RaggedCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: Maximum value to count. Can be set to -1 for no maximum. +// value: int32; maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8634,27 +8634,33 @@ func RaggedCountSparseOutputMaxlength(value int64) RaggedCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// splits: Tensor containing the row splits of the ragged tensor to count. -// values: Tensor containing values of the sparse tensor to count. -// weights: A Tensor of the same shape as indices containing per-index weight values. -// May also be the empty tensor if no weights are used. -// binary_output: Whether to output the number of occurrences of each value or 1. +// splits: int64; Tensor containing the row splits of the ragged tensor to count. +// values: int32 or int64; Tensor containing values of the sparse tensor to count. +// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. +// binary_count: bool; whether to output the number of occurrences of each value or 1. +// output_type: dtype; dtype of the output values tensor. 
// // Returns: -// output_indices: Indices tensor for the resulting sparse tensor object. -// output_values: Values tensor for the resulting sparse tensor object. -// output_dense_shape: Shape tensor for the resulting sparse tensor object. +// output_indices: int64; indices tensor for the resulting sparse tensor object. +// output_values: int64 or float32; values tensor for the resulting sparse tensor object. +// END +// } +// out_arg { +// name: "output_dense_shape" +// description: <= -1 @@ -13712,7 +13718,7 @@ func SparseCountSparseOutputMinlength(value int64) SparseCountSparseOutputAttr { // SparseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: Maximum value to count. Can be set to -1 for no maximum. +// value: int32; maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -13727,22 +13733,22 @@ func SparseCountSparseOutputMaxlength(value int64) SparseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// indices: Tensor containing the indices of the sparse tensor to count. -// values: Tensor containing values of the sparse tensor to count. -// dense_shape: Tensor containing the dense shape of the sparse tensor to count. -// weights: A Tensor of the same shape as indices containing per-index weight values. -// May also be the empty tensor if no weights are used. -// binary_output: Whether to output the number of occurrences of each value or 1. +// indices: int64; Tensor containing the indices of the sparse tensor to count. +// values: int32 or int64; Tensor containing values of the sparse tensor to count. +// dense_shape: int64; Tensor containing the dense shape of the sparse tensor to count. +// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. +// binary_count: bool; whether to output the number of occurrences of each value or 1. +// output_type: dtype; dtype of the output values tensor. // // Returns: -// output_indices: Indices tensor for the resulting sparse tensor object. -// output_values: Values tensor for the resulting sparse tensor object. -// output_dense_shape: Shape tensor for the resulting sparse tensor object. -func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_output bool, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: int64; indices tensor for the resulting sparse tensor object. +// output_values: int64 or float32; values tensor for the resulting sparse tensor object. +// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. +func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_output": binary_output} + attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} for _, a := range optional { a(attrs) } @@ -18969,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From d33cb73389c4198c01d8dac55cbbd6620abe7d4b Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Wed, 13 May 2020 23:48:03 -0700 Subject: [PATCH 179/412] Expose inference type in the mlir quantizer This is to prepare the 16 bits activation quantization release. The data type specified by this flag is only applied on the activations. PiperOrigin-RevId: 311478782 Change-Id: I5f63f0508011cc0b1b47a0debb35c17d3284eae9 --- tensorflow/compiler/mlir/lite/ir/tfl_ops.td | 6 ++-- .../lite/quantization/lite/quantize_model.cc | 10 ++++-- .../lite/quantization/lite/quantize_model.h | 4 ++- .../lite/quantization/lite/tfl_quantizer.cc | 3 +- tensorflow/lite/python/convert.py | 7 ++-- tensorflow/lite/python/lite_v2_test.py | 36 +++++++++++++++++++ tensorflow/lite/python/wrap_toco.py | 6 ++-- tensorflow/lite/toco/python/BUILD | 1 + .../lite/toco/python/toco_python_api.cc | 21 +++++++++-- tensorflow/lite/toco/python/toco_python_api.h | 2 +- .../python/lite/toco_python_api_wrapper.cc | 7 ++-- 11 files changed, 84 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 8a949a45e2d..a585b8e1520 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -414,9 +414,9 @@ class TFL_ConvOp : }]; let arguments = ( - ins TFL_TensorOf<[F32, QI8, QUI8]>:$input, + ins TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$input, TFL_TensorOf<[F32, QI8, QUI8]>:$filter, - TFL_TensorOfOrNone<[F32, I32]>:$bias, + TFL_TensorOfOrNone<[F32, I32, I64]>:$bias, I32Attr:$dilation_h_factor, I32Attr:$dilation_w_factor, TFL_AFAttr:$fused_activation_function, @@ -425,7 +425,7 @@ class TFL_ConvOp : I32Attr:$stride_w ); - let results = (outs TFL_TensorOf<[F32, QI8, QUI8]>:$output); + let results = (outs TFL_TensorOf<[F32, QI8, QUI8, QI16]>:$output); let hasOptions = 0b1; } diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index 0ac3fa419bc..a2e3c065113 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/lite/schema/schema_generated.h" namespace mlir { namespace lite { @@ -38,6 +39,7 @@ namespace lite { TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, + const tflite::TensorType& inference_type, const std::unordered_set& operator_names, bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, @@ -73,7 +75,7 @@ TfLiteStatus QuantizeModel( // Apply quantization passes PassManager pm(module->getContext()); TFL::QuantizationSpecs quant_specs; - quant_specs.inference_type = tensorflow::DT_QINT8; + quant_specs.inference_type = tflite::TflTypeToTfType(inference_type); quant_specs.post_training_quantization = true; quant_specs.disable_per_channel = disable_per_channel; @@ -81,8 +83,10 @@ TfLiteStatus QuantizeModel( auto input_tf_type = tflite::TflTypeToTfType(input_type); if (input_tf_type == tensorflow::DT_FLOAT) { emit_adaptor = true; - } else if (input_tf_type == tensorflow::DT_UINT8) { - quant_specs.inference_type = tensorflow::DT_QUINT8; + } else if (input_tf_type == tensorflow::DT_UINT8 || + input_tf_type == tensorflow::DT_INT8 || + input_tf_type == tensorflow::DT_INT16) { + quant_specs.inference_type = input_tf_type; } pm.addPass(TFL::CreatePrepareQuantizePass(quant_specs)); diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h index 578aa6438de..d60df56b473 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h @@ -26,11 +26,13 @@ namespace mlir { namespace lite { // Quantize the `input_model` and write the result to a flatbuffer `builder`. -// The `input_type` and `output_type` can be float32/qint8/int8. +// The `input_type`, `output_type` and `inference_type` can be +// float32/qint8/int8/int16. // Return partially quantized model if `fully_quantize` is false. 
TfLiteStatus QuantizeModel( const tflite::ModelT& input_model, const tflite::TensorType& input_type, const tflite::TensorType& output_type, + const tflite::TensorType& inference_type, const std::unordered_set& operator_names, bool disable_per_channel, bool fully_quantize, flatbuffers::FlatBufferBuilder* builder, diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc index 77bd87a3c03..5bd1b71e631 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc @@ -46,7 +46,8 @@ TfLiteStatus QuantizeAnnotatedModel(llvm::StringRef buffer, tflite::StderrReporter error_reporter; return mlir::lite::QuantizeModel( - *model, tflite::TensorType_INT8, tflite::TensorType_INT8, {}, + *model, tflite::TensorType_INT8, tflite::TensorType_INT8, + tflite::TensorType_INT8, {}, /*disable_per_channel=*/false, /*fully_quantize=*/true, builder, &error_reporter); } diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index ae70afd6962..6b7a32f1bcc 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -108,7 +108,8 @@ class ConverterError(Exception): pass -def mlir_quantize(input_data_str, disable_per_channel=False): +def mlir_quantize(input_data_str, disable_per_channel=False, + inference_type=_types_pb2.INT8): """Quantize `input_data_str` with calibration results. Args: @@ -116,13 +117,15 @@ def mlir_quantize(input_data_str, disable_per_channel=False): calibration results). disable_per_channel: Bool indicating whether to do per-channel or per-tensor quantization + inference_type: Data type for the activations. The default value is int8. Returns: Quantized model in serialized form (e.g. a TFLITE model) with floating-point inputs and outputs. """ return wrap_toco.wrapped_experimental_mlir_quantize(input_data_str, - disable_per_channel) + disable_per_channel, + inference_type) def mlir_sparsify(input_data_str): diff --git a/tensorflow/lite/python/lite_v2_test.py b/tensorflow/lite/python/lite_v2_test.py index 4768892f359..9af37df2975 100644 --- a/tensorflow/lite/python/lite_v2_test.py +++ b/tensorflow/lite/python/lite_v2_test.py @@ -29,7 +29,9 @@ import tensorflow as tf from tensorflow.lite.python import lite from tensorflow.lite.python import lite_v2_test_util +from tensorflow.lite.python.convert import mlir_quantize from tensorflow.lite.python.interpreter import Interpreter +from tensorflow.lite.toco import types_pb2 as _types_pb2 from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.keras.layers import recurrent @@ -204,6 +206,40 @@ class FromConcreteFunctionTest(lite_v2_test_util.ModelTest): # Ensure that the quantized weights tflite model is smaller. self.assertLess(len(quantized_tflite), len(float_tflite)) + def testCalibrateAndQuantizeBuiltinInt16(self): + func, calibration_gen = self._getCalibrationQuantizeModel() + + # Convert float model. + float_converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + float_tflite = float_converter.convert() + self.assertTrue(float_tflite) + + converter = lite.TFLiteConverterV2.from_concrete_functions([func]) + # TODO(b/156309549): We should add INT16 to the builtin types. 
+ converter.target_spec.supported_ops = [ + lite.OpsSet.TFLITE_BUILTINS_INT8 + ] + converter.representative_dataset = calibration_gen + converter._experimental_calibrate_only = True + calibrated_tflite = converter.convert() + quantized_tflite = mlir_quantize(calibrated_tflite, + inference_type=_types_pb2.QUANTIZED_INT16) + + self.assertTrue(quantized_tflite) + + # The default input and output types should be float. + interpreter = Interpreter(model_content=quantized_tflite) + interpreter.allocate_tensors() + input_details = interpreter.get_input_details() + self.assertLen(input_details, 1) + self.assertEqual(np.float32, input_details[0]['dtype']) + output_details = interpreter.get_output_details() + self.assertLen(output_details, 1) + self.assertEqual(np.float32, output_details[0]['dtype']) + + # Ensure that the quantized weights tflite model is smaller. + self.assertLess(len(quantized_tflite), len(float_tflite)) + def _getTrainingTimeQuantizedModel(self): class QLinear(tf.keras.layers.Layer): diff --git a/tensorflow/lite/python/wrap_toco.py b/tensorflow/lite/python/wrap_toco.py index 3c1f98ff42d..8f72cc8cbbd 100644 --- a/tensorflow/lite/python/wrap_toco.py +++ b/tensorflow/lite/python/wrap_toco.py @@ -43,10 +43,12 @@ def wrapped_get_potentially_supported_ops(): return _pywrap_toco_api.TocoGetPotentiallySupportedOps() -def wrapped_experimental_mlir_quantize(input_data_str, disable_per_channel): +def wrapped_experimental_mlir_quantize(input_data_str, disable_per_channel, + inference_type): """Wraps experimental mlir quantize model.""" return _pywrap_toco_api.ExperimentalMlirQuantizeModel(input_data_str, - disable_per_channel) + disable_per_channel, + inference_type) def wrapped_experimental_mlir_sparsify(input_data_str): diff --git a/tensorflow/lite/toco/python/BUILD b/tensorflow/lite/toco/python/BUILD index bea582d83a5..7dfa714d1d6 100644 --- a/tensorflow/lite/toco/python/BUILD +++ b/tensorflow/lite/toco/python/BUILD @@ -54,6 +54,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/python:saved_model_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/lite/quantization/lite:quantize_model", "//tensorflow/compiler/mlir/lite/sparsity:sparsify_model", + "//tensorflow/lite/toco:types_proto_cc", ] + select({ # This is required when running `tflite_convert` from `bazel`. # It requires to link with TensorFlow Ops to get the op definitions. diff --git a/tensorflow/lite/toco/python/toco_python_api.cc b/tensorflow/lite/toco/python/toco_python_api.cc index aafd14f9da8..441aabf0ffe 100644 --- a/tensorflow/lite/toco/python/toco_python_api.cc +++ b/tensorflow/lite/toco/python/toco_python_api.cc @@ -41,6 +41,7 @@ limitations under the License. 
#include "tensorflow/lite/toco/toco_tooling.h" #include "tensorflow/lite/toco/toco_types.h" #include "tensorflow/lite/toco/tooling_util.h" +#include "tensorflow/lite/toco/types.pb.h" namespace toco { @@ -229,7 +230,7 @@ PyObject* TocoGetPotentiallySupportedOps() { } PyObject* MlirQuantizeModel(PyObject* data, bool disable_per_channel, - bool fully_quantize) { + bool fully_quantize, int inference_type) { using tflite::interpreter_wrapper::PythonErrorReporter; char* buf = nullptr; Py_ssize_t length; @@ -249,11 +250,25 @@ PyObject* MlirQuantizeModel(PyObject* data, bool disable_per_channel, auto tflite_model = absl::make_unique(); model->GetModel()->UnPackTo(tflite_model.get(), nullptr); + tflite::TensorType inference_tensor_type; + switch (inference_type) { + case toco::IODataType::QUANTIZED_INT16: + inference_tensor_type = tflite::TensorType_INT16; + break; + case toco::IODataType::QUANTIZED_UINT8: + inference_tensor_type = tflite::TensorType_UINT8; + break; + case toco::IODataType::INT8: + inference_tensor_type = tflite::TensorType_INT8; + break; + default: + return nullptr; + } flatbuffers::FlatBufferBuilder builder; auto status = mlir::lite::QuantizeModel( *tflite_model, tflite::TensorType::TensorType_FLOAT32, - tflite::TensorType::TensorType_FLOAT32, {}, disable_per_channel, - fully_quantize, &builder, error_reporter.get()); + tflite::TensorType::TensorType_FLOAT32, inference_tensor_type, {}, + disable_per_channel, fully_quantize, &builder, error_reporter.get()); if (status != kTfLiteOk) { error_reporter->exception(); diff --git a/tensorflow/lite/toco/python/toco_python_api.h b/tensorflow/lite/toco/python/toco_python_api.h index 7afb097fd4a..058ae9fb942 100644 --- a/tensorflow/lite/toco/python/toco_python_api.h +++ b/tensorflow/lite/toco/python/toco_python_api.h @@ -44,7 +44,7 @@ PyObject* TocoGetPotentiallySupportedOps(); // is specified by the calibration data are not sufficient to quantize the // model. PyObject* MlirQuantizeModel(PyObject* data, bool disable_per_channel, - bool fully_quantize); + bool fully_quantize, int inference_type); // Sparsifies model to encode sparse tensors with proper format. Throws error if // sparsification fails. diff --git a/tensorflow/python/lite/toco_python_api_wrapper.cc b/tensorflow/python/lite/toco_python_api_wrapper.cc index e6e0e111ec4..b77200a3bee 100644 --- a/tensorflow/python/lite/toco_python_api_wrapper.cc +++ b/tensorflow/python/lite/toco_python_api_wrapper.cc @@ -57,12 +57,13 @@ PYBIND11_MODULE(_pywrap_toco_api, m) { m.def( "ExperimentalMlirQuantizeModel", [](py::object input_contents_txt_raw, bool disable_per_channel, - bool fully_quantize) { + bool fully_quantize, int inference_type) { return tensorflow::PyoOrThrow(toco::MlirQuantizeModel( - input_contents_txt_raw.ptr(), disable_per_channel, fully_quantize)); + input_contents_txt_raw.ptr(), disable_per_channel, fully_quantize, + inference_type)); }, py::arg("input_contents_txt_raw"), py::arg("disable_per_channel") = false, - py::arg("fully_quantize") = true, + py::arg("fully_quantize") = true, py::arg("inference_type") = 9, R"pbdoc( Returns a quantized model. 
)pbdoc"); From d36ad412f43672e84366c34428663553238e85c1 Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Thu, 14 May 2020 00:11:31 -0700 Subject: [PATCH 180/412] Add IOS_BENCHMARK enum value to BenchmarkType PiperOrigin-RevId: 311481101 Change-Id: I142b5b6231a817df6b688786fa508379ce06dd79 --- tensorflow/core/util/test_log.proto | 41 +++++++++++++++-------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/tensorflow/core/util/test_log.proto b/tensorflow/core/util/test_log.proto index ddb7a0275ac..6d3af02e657 100644 --- a/tensorflow/core/util/test_log.proto +++ b/tensorflow/core/util/test_log.proto @@ -1,6 +1,8 @@ // Protocol messages for describing the results of benchmarks and unit tests. syntax = "proto3"; +package tensorflow; + import "google/protobuf/any.proto"; import "google/protobuf/wrappers.proto"; @@ -9,14 +11,12 @@ option java_outer_classname = "TestLogProtos"; option java_multiple_files = true; option java_package = "org.tensorflow.util.testlog"; -package tensorflow; - message EntryValue { oneof kind { double double_value = 1; string string_value = 2; } -}; +} message MetricEntry { // Metric name @@ -62,7 +62,7 @@ message BenchmarkEntry { // Metric name, value and expected range. This can include accuracy metrics // typically used to determine whether the accuracy test has passed repeated MetricEntry metrics = 7; -}; +} message BenchmarkEntries { repeated BenchmarkEntry entry = 1; @@ -72,7 +72,7 @@ message BuildConfiguration { string mode = 1; // opt, dbg, etc repeated string cc_flags = 2; // CC compiler flags, if known repeated string opts = 3; // Bazel compilation options, if known -}; +} message CommitId { oneof kind { @@ -85,7 +85,7 @@ message CommitId { string snapshot = 3; // Changelist tested if the change list is not already submitted. int64 pending_changelist = 4; -}; +} message CPUInfo { int64 num_cores = 1; @@ -105,7 +105,7 @@ message CPUInfo { // Cache sizes (in bytes), e.g. "L2": 262144 (for 256KB) map cache_size = 6; -}; +} message MemoryInfo { int64 total = 1; // Total virtual memory in bytes @@ -113,26 +113,26 @@ message MemoryInfo { } message GPUInfo { - string model = 1; // e.g. "Tesla K40c" - string uuid = 2; // Final entry in output of "nvidia-smi -L" + string model = 1; // e.g. "Tesla K40c" + string uuid = 2; // Final entry in output of "nvidia-smi -L" string bus_id = 3; // e.g. "0000:04:00.0" -}; +} message PlatformInfo { - string bits = 1; // e.g. '64bit' - string linkage = 2; // e.g. 'ELF' - string machine = 3; // e.g. 'i386' - string release = 4; // e.g. '3.13.0-76-generic' - string system = 5; // e.g. 'Linux' - string version = 6; // e.g. '#120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016' -}; + string bits = 1; // e.g. '64bit' + string linkage = 2; // e.g. 'ELF' + string machine = 3; // e.g. 'i386' + string release = 4; // e.g. '3.13.0-76-generic' + string system = 5; // e.g. 'Linux' + string version = 6; // e.g. '#120-Ubuntu SMP Mon Jan 18 15:59:10 UTC 2016' +} message AvailableDeviceInfo { // Matches DeviceAttributes string name = 1; // Device name. string type = 2; // Device type, e.g. 'CPU' or 'GPU'. int64 memory_limit = 3; // Memory capacity in bytes. string physical_description = 4; // The physical description of this device. -}; +} message MachineConfiguration { // Host name of machine that ran the benchmark. @@ -154,7 +154,7 @@ message MachineConfiguration { repeated AvailableDeviceInfo available_device_info = 5; MemoryInfo memory_info = 6; -}; +} // Run-specific items such as arguments to the test / benchmark. 
message RunConfiguration { @@ -206,6 +206,7 @@ message TestResults { PYTHON_BENCHMARK = 2; ANDROID_BENCHMARK = 3; EDGE_BENCHMARK = 4; + IOS_BENCHMARK = 5; } BenchmarkType benchmark_type = 10; @@ -219,4 +220,4 @@ message TestResults { // TensorFlow version this benchmark runs against. // This can be either set to full version or just the major version. string tf_version = 12; -}; +} From da78c46560fbccec8e61039e6b836fee5ebdc8c1 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Thu, 14 May 2020 00:31:22 -0700 Subject: [PATCH 181/412] Fix comment to reflect actual logic. PiperOrigin-RevId: 311483016 Change-Id: Ib66f41fd4c470bdcd516f4f03de7d78fb8ddde1c --- tensorflow/lite/graph_info.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/graph_info.cc b/tensorflow/lite/graph_info.cc index 875a03af817..a419a56a9e6 100644 --- a/tensorflow/lite/graph_info.cc +++ b/tensorflow/lite/graph_info.cc @@ -191,11 +191,11 @@ class PartitionGraphIntoIndependentNodeSubsetsImpl { std::vector* node_subsets_; std::vector node_type_; // Maps from tensor index to the epoch in which it is assigned. Also special - // negative values of kEpochNotAssigned if not assigned, kEpochNotReady if it - // is an input or constant. + // negative values of kEpochNotReady if not assigned, kEpochAlwaysReady if it + // is an input to the whole model or a constant that has no dependencies. std::vector tensor_epochs_; // Maps from tensor index to the epoch in which it is assigned. Also special - // negative values of kEpochNotAssigned if not assigned. + // negative values of kEpochNotReady if not assigned. std::vector node_epochs_; }; From ca18db7f3f5057bb83c41f4710d7a6a75224300d Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Thu, 14 May 2020 01:38:34 -0700 Subject: [PATCH 182/412] Return a meaningful error for dynamic shape inputs with outside compilation head extraction in TPUs. PiperOrigin-RevId: 311490072 Change-Id: Idc7bf1764aba1fcbfcf830e36a5b575b387923d7 --- .../python/distribute/tpu_strategy_test.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index de4c975d5ef..6c93e29c028 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -28,6 +28,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import remote from tensorflow.python.eager import test +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -140,6 +141,9 @@ class TPUStrategyTest(test.TestCase): # for non-local TPU. if FLAGS.tpu: self.skipTest("Recovery fails for non-local TPU, see b/148150981") + + # Disable automatic outside compilation. + config.set_soft_device_placement(False) strategy = get_tpu_strategy() @def_function.function @@ -164,6 +168,28 @@ class TPUStrategyTest(test.TestCase): good_run() + def test_dynamic_shape_with_outside_compilation_failure(self): + # Enable automatic outside compilation. 
+ config.set_soft_device_placement(True) + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( + 2, drop_remainder=False) + dataset = strategy.experimental_distribute_dataset(dataset) + iterator = iter(dataset) + + @def_function.function + def train_fn(iterator): + + def step_fn(inputs): + _, inputs = inputs + return math_ops.reduce_sum(inputs) + + return strategy.experimental_local_results( + strategy.run(step_fn, args=(next(iterator),))) + + with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): + logging.info(train_fn(iterator)) + def test_computation_on_subset_cores(self): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) From b187ba0bcc04d471ee7cd60aaddbcdfc892e24c6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 01:46:54 -0700 Subject: [PATCH 183/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/bfa200ebcf37 PiperOrigin-RevId: 311490759 Change-Id: Icd37195b07135947a26f185a8d2a1ddc1adf718c --- .../mlir/xla/tests/lhlo-fuse-linalg.mlir | 52 +++++++++---------- .../lhlo-legalize-select-and-scatter.mlir | 34 ++++++------ .../mlir/xla/tests/lhlo-legalize-to-gpu.mlir | 2 +- .../lhlo-legalize-to-parallel-loops.mlir | 50 +++++++++--------- .../lhlo_legalize_to_parallel_loops.cc | 48 ++++++++--------- .../compiler/mlir/xla/transforms/passes.h | 4 +- 6 files changed, 95 insertions(+), 95 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir index 013748fea28..99b1766e73c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-fuse-linalg.mlir @@ -24,9 +24,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -36,9 +36,9 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -46,8 +46,8 @@ func @fusion(%multiplier: memref<6x6xf32>, %summand_1: memref<6x6xf32>, // PLOOP-LABEL: func @fusion // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic @@ -94,9 +94,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // CHECK-LABEL: func @fusion // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: linalg.generic // CHECK: subf @@ -107,9 +107,9 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] 
= constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: linalg.generic // TILED: subf @@ -118,8 +118,8 @@ func @fusion_of_three(%arg0: memref<100x10xf32>, // PLOOP-LABEL: func @fusion_of_three // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: linalg.generic // PLOOP: subf @@ -147,11 +147,11 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // CHECK-LABEL: func @fusion_4d // CHECK: %[[C1:.*]] = constant 1 // CHECK-NOT: linalg.generic -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK: loop.for {{.*}} step %[[C1]] -// CHECK-NOT: loop.for +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK: scf.for {{.*}} step %[[C1]] +// CHECK-NOT: scf.for // CHECK: linalg.generic // CHECK: addf // CHECK: linalg.generic @@ -161,9 +161,9 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // TILED-DAG: %[[C2:.*]] = constant 2 // TILED-DAG: %[[C3:.*]] = constant 3 // TILED-NOT: linalg.generic -// TILED: loop.for {{.*}} step %[[C2]] -// TILED: loop.for {{.*}} step %[[C3]] -// TILED-NOT: loop.for +// TILED: scf.for {{.*}} step %[[C2]] +// TILED: scf.for {{.*}} step %[[C3]] +// TILED-NOT: scf.for // TILED: linalg.generic // TILED: addf // TILED: linalg.generic @@ -171,8 +171,8 @@ func @fusion_4d(%multiplier: memref<6x6x6x6xf32>, %summand_1: memref<6x6x6x6xf32 // PLOOP-LABEL: func @fusion_4d // PLOOP-NOT: linalg.generic -// PLOOP: loop.parallel -// PLOOP-NOT: loop.parallel +// PLOOP: scf.parallel +// PLOOP-NOT: scf.parallel // PLOOP: linalg.generic // PLOOP: addf // PLOOP: linalg.generic diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir index 5b763cde2ed..c640b395f4d 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-select-and-scatter.mlir @@ -50,19 +50,19 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Parallel loop to initialize the output buffer. // CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref -// CHECK: loop.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C112]], [[C112]]) step ([[C1]], [[C1]]) { // CHECK: store [[INIT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // Parallel loop over source buffer to compute scattered values. -// CHECK: loop.parallel ([[II:%.*]], [[JJ:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[II:%.*]], [[JJ:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { // Window loop w.r.t. first dim. 
// CHECK: [[SEL_RES_I:%.*]]:4 -// CHECK-SAME: = loop.for [[WIN_I:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: = scf.for [[WIN_I:%.*]] = [[C0]] to [[C3]] step [[C1]] // CHECK-SAME: iter_args( // CHECK-SAME: [[SEL_I_0:%.*]] = [[C0]], [[SEL_J_0:%.*]] = [[C0]], // CHECK-SAME: [[SEL_VAL_0:%.*]] = [[C0_F32]], @@ -71,7 +71,7 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Window loop w.r.t. second dim. // CHECK: [[SEL_RES_J:%.*]]:4 -// CHECK-SAME: = loop.for [[WIN_J:%.*]] = [[C0]] to [[C3]] step [[C1]] +// CHECK-SAME: = scf.for [[WIN_J:%.*]] = [[C0]] to [[C3]] step [[C1]] // CHECK-SAME: iter_args( // CHECK-SAME: [[SEL_I:%.*]] = [[SEL_I_0]], [[SEL_J:%.*]] = [[SEL_J_0]], // CHECK-SAME: [[SEL_VAL:%.*]] = [[SEL_VAL_0]], @@ -102,14 +102,14 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // be applied, current selected ivs (SEL_I, SEL_J) and value (SEL_VAL) are // returned in that case. // CHECK: [[IF_INBOUNDS_RES:%.*]]:4 -// CHECK-SAME: = loop.if [[INBOUNDS_1]] -> (index, index, f32, i1) { +// CHECK-SAME: = scf.if [[INBOUNDS_1]] -> (index, index, f32, i1) { // INBOUNDS-THEN-BODY, i.e. if INBOUNDS == true // CHECK: [[ARG_ELEM:%.*]] = load [[ARG_BUF]]{{\[}}[[ARG_I]], [[ARG_J]]] // CHECK: [[IF_INIT_RES:%.*]]:4 - // CHECK-SAME: = loop.if [[SEL_INIT]] -> (index, index, f32, i1) { + // CHECK-SAME: = scf.if [[SEL_INIT]] -> (index, index, f32, i1) { // INIT-THEN-BODY, i.e. INBOUNDS == true and INIT = true @@ -133,40 +133,40 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // Depending on PRED, return ARG ivs & elem or current select ivs and value. - // CHECK: [[IF_PRED_RES:%.*]]:4 = loop.if [[PRED]] - // CHECK: loop.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], [[CTRUE]] + // CHECK: [[IF_PRED_RES:%.*]]:4 = scf.if [[PRED]] + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], [[CTRUE]] // CHECK: } else { - // CHECK: loop.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], [[SEL_INIT]] + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], [[SEL_INIT]] // CHECK: } // INIT-THEN-BODY yield. - // CHECK: loop.yield [[IF_PRED_RES]]#0, [[IF_PRED_RES]]#1, + // CHECK: scf.yield [[IF_PRED_RES]]#0, [[IF_PRED_RES]]#1, // CHECK-SAME: [[IF_PRED_RES]]#2, [[IF_PRED_RES]]#3 // INIT-ELSE-BODY, i.e. if INBOUNDS == TRUE and INIT == FALSE, returns ARG // ivs and element without computing Select function. - // CHECK: loop.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], + // CHECK: scf.yield [[ARG_I]], [[ARG_J]], [[ARG_ELEM]], // CHECK-SAME: [[CTRUE]] : index, index, f32, i1 // CHECK: } // INBOUNDS-THEN-BODY yield. - // CHECK: loop.yield [[IF_INIT_RES]]#0, [[IF_INIT_RES]]#1, [[IF_INIT_RES]]#2, + // CHECK: scf.yield [[IF_INIT_RES]]#0, [[IF_INIT_RES]]#1, [[IF_INIT_RES]]#2, // CHECK-SAME: [[IF_INIT_RES]]#3 : index, index, f32, i1 // CHECK: } // INBOUNDS-ELSE-REGION, i.e. if INBOUNDS == FALSE // We are in the pad area, return current iter_args. - // CHECK: loop.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], + // CHECK: scf.yield [[SEL_I]], [[SEL_J]], [[SEL_VAL]], // CHECK-SAME: [[SEL_INIT]] : index, index, f32, i1 // CHECK: } // Window loop w.r.t. second dim yield. -// CHECK: loop.yield [[IF_INBOUNDS_RES]]#0, [[IF_INBOUNDS_RES]]#1, +// CHECK: scf.yield [[IF_INBOUNDS_RES]]#0, [[IF_INBOUNDS_RES]]#1, // CHECK-SAME: [[IF_INBOUNDS_RES]]#2, [[IF_INBOUNDS_RES]]#3 // CHECK: } // Window loop w.r.t. first dim yield. 
-// CHECK: loop.yield [[SEL_RES_J]]#0, [[SEL_RES_J]]#1, [[SEL_RES_J]]#2, +// CHECK: scf.yield [[SEL_RES_J]]#0, [[SEL_RES_J]]#1, [[SEL_RES_J]]#2, // CHECK-SAME: [[SEL_RES_J]]#3 : index, index, f32, i1 // CHECK: } @@ -196,4 +196,4 @@ func @select_and_scatter(%arg: memref<112x112xf32>, // CHECK: atomic_yield [[RES]] : f32 // Parallel loop over source buffer yield -// CHECK: loop.yield +// CHECK: scf.yield diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir index 4d878cee6f4..16ffbf241b0 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-gpu.mlir @@ -22,7 +22,7 @@ func @reduce(%arg: memref<100x10xf32>, // CHECK-DAG: %[[LB:.*]] = constant 0 : index // CHECK-DAG: %[[UB:.*]] = constant 10 : index // CHECK-DAG: %[[STEP:.*]] = constant 1 : index -// CHECK: loop.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { +// CHECK: scf.for %[[IDX1:.*]] = %[[LB]] to %[[UB]] step %[[STEP]] { // CHECK: %[[LHS:.*]] = linalg.slice %[[ARG2]][%[[IDX]]] : memref<100xf32>, index, memref // CHECK: %[[RHS:.*]] = linalg.slice %[[ARG0]][%[[IDX]], %[[IDX1]]] : memref<100x10xf32>, index, index, memref // CHECK: "xla_lhlo.add"(%[[LHS]], %[[RHS]], %[[LHS]]) : (memref, memref, memref) -> () diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir index cb169e060ef..32c367f97d6 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-parallel-loops.mlir @@ -22,13 +22,13 @@ func @reduce(%arg: memref<100x10x5xf32>, // CHECK-DAG: [[C10:%.*]] = constant 10 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C100]], [[C5]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[C10]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref<100x10x5xf32> -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -37,12 +37,12 @@ func @reduce(%arg: memref<100x10x5xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -66,10 +66,10 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK-DAG: [[C1:%.*]] = constant 1 : index // CHECK-DAG: [[C100:%.*]] = constant 100 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[I:%.*]]) = ([[C0]]) +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[I:%.*]]) = ([[C0]]) // 
CHECK-SAME: to ([[C100]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]]{{\[}}[[I]]{{\]}} -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -78,9 +78,9 @@ func @reduce_no_outer_loop(%arg: memref<100xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] +// CHECK: scf.reduce.return [[ACC_RESULT]] // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[C0]]] // ----- @@ -107,13 +107,13 @@ func @dynamic_reduce(%arg: memref, // CHECK: [[DIM1:%.*]] = dim [[ARG_BUF]], 1 : memref // CHECK: [[DIM2:%.*]] = dim [[ARG_BUF]], 2 : memref // CHECK: [[INIT:%.*]] = load [[INIT_BUF]] -// CHECK: loop.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[K:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[DIM0]], [[DIM2]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel ([[J:%.*]]) = +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel ([[J:%.*]]) = // CHECK-SAME: ([[C0]]) to ([[DIM1]]) step ([[C1]]) init ([[INIT]]) -> f32 { // CHECK: [[ELEM_TO_REDUCE:%.*]] = load [[ARG_BUF]] // CHECK-SAME: {{\[}}[[I]], [[J]], [[K]]] : memref -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -122,12 +122,12 @@ func @dynamic_reduce(%arg: memref, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.add"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[K]]] -// CHECK: loop.yield +// CHECK: scf.yield // ----- @@ -158,9 +158,9 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK-DAG: [[C56:%.*]] = constant 56 : index // CHECK-DAG: [[C112:%.*]] = constant 112 : index // CHECK: [[INIT:%.*]] = load [[INIT_BUF]][] : memref -// CHECK: loop.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) +// CHECK: scf.parallel ([[I:%.*]], [[J:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C56]], [[C56]]) step ([[C1]], [[C1]]) { -// CHECK: [[REDUCTION_RESULT:%.*]] = loop.parallel +// CHECK: [[REDUCTION_RESULT:%.*]] = scf.parallel // CHECK-SAME: ([[IW:%.*]], [[JW:%.*]]) = ([[C0]], [[C0]]) // CHECK-SAME: to ([[C3]], [[C3]]) step ([[C1]], [[C1]]) // CHECK-SAME: init ([[INIT]]) -> f32 { @@ -177,15 +177,15 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: [[INDEX_J_FITS:%.*]] = cmpi "ult", [[INDEX_J]], [[C112]] // CHECK: [[IN_BOUNDS_1:%.*]] = and [[IN_BOUNDS_0]], [[INDEX_J_FITS]] -// CHECK: [[ELEM_TO_REDUCE:%.*]] = loop.if [[IN_BOUNDS_1]] -> (f32) { +// CHECK: [[ELEM_TO_REDUCE:%.*]] = scf.if [[IN_BOUNDS_1]] -> (f32) { // CHECK: [[OPERAND_ELEM:%.*]] = // CHECK-SAME: load [[OPERAND_BUF]]{{\[}}[[INDEX_I]], [[INDEX_J]]] -// CHECK: loop.yield [[OPERAND_ELEM]] : f32 +// CHECK: scf.yield [[OPERAND_ELEM]] : f32 // CHECK: } 
else { -// CHECK: loop.yield [[INIT]] : f32 +// CHECK: scf.yield [[INIT]] : f32 // CHECK: } -// CHECK: loop.reduce([[ELEM_TO_REDUCE]]) : f32 { +// CHECK: scf.reduce([[ELEM_TO_REDUCE]]) : f32 { // CHECK: ^bb0([[ELEM:%.*]]: f32, [[ACC:%.*]]: f32): // CHECK: [[ELEM_BUF:%.*]] = alloc() : memref // CHECK: [[ACC_BUF:%.*]] = alloc() : memref @@ -194,12 +194,12 @@ func @reduce_window(%arg: memref<112x112xf32>, // CHECK: store [[ACC]], [[ACC_BUF]][] : memref // CHECK: "xla_lhlo.maximum"([[ELEM_BUF]], [[ACC_BUF]], [[ACC_OUT_BUF]]) // CHECK: [[ACC_RESULT:%.*]] = load [[ACC_OUT_BUF]][] : memref -// CHECK: loop.reduce.return [[ACC_RESULT]] : f32 +// CHECK: scf.reduce.return [[ACC_RESULT]] : f32 // CHECK: } -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: store [[REDUCTION_RESULT]], [[RESULT_BUF]]{{\[}}[[I]], [[J]]] -// CHECK: loop.yield +// CHECK: scf.yield // CHECK: } // CHECK: return // CHECK: } diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc index c5f5b39e04c..734a75a4307 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_legalize_to_parallel_loops.cc @@ -61,8 +61,8 @@ Value ApplySingleResultLhloCode(Location loc, ValueRange operands, // Converts a block with LHLO ops and with signature: // ^bb(%lhs: memref, %rhs: memref, %res: memref): -// into a reduction operator of loop.reduce by doing buffer allocation for -// scalar arguments and the result of `loop.reduce` to make it compatible with +// into a reduction operator of scf.reduce by doing buffer allocation for +// scalar arguments and the result of `scf.reduce` to make it compatible with // LHLO ops. void ConvertToReductionOperator(Location loc, scf::ReduceOp reduce_op, Block* lhlo_block, OpBuilder* b) { @@ -170,10 +170,10 @@ scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // is roughly converted into: // // %init = load %init_buf[] : memref -// loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { -// %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { +// scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { +// %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> -// loop.reduce(%elem_to_reduce) { +// scf.reduce(%elem_to_reduce) { // ^bb0(%elem: f32, %acc: f32): // no predecessors // elem_buf = alloc() : memref // store %elem, elem_buf[] : memref @@ -181,11 +181,11 @@ scf::ParallelOp MakeLoopOverShape(Location loc, Value shaped_value, // store %acc, acc_buf[] : memref // // %acc_result = load acc_buf[] : memref -// loop.reduce.return %acc_result : f32 +// scf.reduce.return %acc_result : f32 // } : f32 -// loop.yield +// scf.yield // } : f32 -// loop.yield +// scf.yield // } class ReduceOpConverter : public OpConversionPattern { public: @@ -206,24 +206,24 @@ class ReduceOpConverter : public OpConversionPattern { } private: - // Creates nested `loop.parallel` ops with `loop.reduce`. The outer ParallelOp + // Creates nested `scf.parallel` ops with `scf.reduce`. The outer ParallelOp // refers to the parallel dimensions of `xla_reduce_op` if any and the inner - // ParallelOp refers to the reduction dimensions. The loop.reduce op is + // ParallelOp refers to the reduction dimensions. The scf.reduce op is // returned. 
// // If the reduction argument is a memref<100x10x5xf32> and the // reduction is performed along dimension 1 then this method will generate // // %init = load %init_buf[] : memref - // loop.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { - // %result = loop.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { + // scf.parallel (%i, %k) = (%c0, %c0) to (%c100, %c5) step (%c1, %c1) { + // %result = scf.parallel (%j) = (%c0) to (%c10) step (%c1) init (%init) { // %elem_to_reduce = load %buffer[%i, %j, %k] : memref<100x10x5xf32> - // loop.reduce(%elem_to_reduce) { + // scf.reduce(%elem_to_reduce) { // // } : f32 - // loop.yield + // scf.yield // } : f32 - // loop.yield + // scf.yield // } scf::ReduceOp CreateReduceOpInNestedParallelLoops( xla_lhlo::ReduceOp xla_reduce_op, @@ -341,20 +341,20 @@ class ReduceOpConverter : public OpConversionPattern { // is roughly converted into: // // %neutral_elem = load %init_buf[] : memref -// loop.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { -// %result = loop.parallel (%iw, %jw) = (%c0, %c0) +// scf.parallel (%i, %j) = (%c0, %c0) to (%c56, %c56) step (%c1, %c1) { +// %result = scf.parallel (%iw, %jw) = (%c0, %c0) // to (%c3, %c3) step (%c1, %c1) neutral_elem (%0) -> f32 { // %in_bounds = // %elem = load %operand[%computed_i, %computed_j] // %elem_or_neutral = select %in_bounds, %elem, %neutral_elem : f32 -// loop.reduce(%elem_to_reduce) : f32 { +// scf.reduce(%elem_to_reduce) : f32 { // ^bb0(%arg7: f32, %arg8: f32): // // } -// loop.yield +// scf.yield // } // store %result, %output_buffer[%i, %j] : memref<56x56xf32> -// loop.yield +// scf.yield // } // return // } @@ -457,16 +457,16 @@ class ReduceWindowOpConverter // https://www.tensorflow.org/xla/operation_semantics#selectandscatter // // Pseudocode: -// loop.parallel(coordinates O in the output): +// scf.parallel(coordinates O in the output): // output[O] = init -// loop.parallel(coordinates S in the source): +// scf.parallel(coordinates S in the source): // selected_ivs = 0 // selected_val = 0 // initialized_flag = false -// loop.for (first dim W_1 in the window) +// scf.for (first dim W_1 in the window) // iter_args (selected_ivs, selected_val, initialized_flag): // ... -// loop.for (last dim W_N in the window): +// scf.for (last dim W_N in the window): // iter_args (selected_ivs, selected_val, initialized_flag): // I = S * stride + W - pad_low // if I within bounds of operand: diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 2d0164981a3..39375e210d5 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -81,8 +81,8 @@ std::unique_ptr> createLegalizeToGpuPass(); // Fuses linalg ops obtained after LHLO lowering. To enable fusion, // operations are first tiled. // -// When 'use_parallel_loops' is set, the tiling will use loop.parallel -// operations. Otherwise, loop.for operations are used. +// When 'use_parallel_loops' is set, the tiling will use scf.parallel +// operations. Otherwise, scf.for operations are used. // // 'tile_sizes' provides the tile sizes to use for tiling. If the linalg // operation has more dimensions than tile sizes provided, 1 is used as From d7503555753420aba3a4f9010bb5f7ed13d6c9ca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:02:43 -0700 Subject: [PATCH 184/412] Update GraphDef version to 401. 
PiperOrigin-RevId: 311492238 Change-Id: I93cb2eda8127d2ca0504ba2e06911a994c190347 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 68df6a1b632..a534c0cf827 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 400 // Updated: 2020/5/13 +#define TF_GRAPH_DEF_VERSION 401 // Updated: 2020/5/14 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 9d0cf955c1e1bd2e653b93bf939c6f1617d67881 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:02:45 -0700 Subject: [PATCH 185/412] compat: Update forward compatibility horizon to 2020-05-14 PiperOrigin-RevId: 311492245 Change-Id: I64918fc404fd05bb26edf1910f3bbab07a7856f5 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 26d291877cb..2a21590bb9a 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 13) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 14) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From e10d6dd07b0f08ff3e039bb7276b0417668d5928 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:10:24 -0700 Subject: [PATCH 186/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311493179 Change-Id: I58caf5368efe0ff0fc5d0ef72320347d677fd888 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c6d67c9ad44..598e3a48bfe 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 23d478c4228095a2c7d47bae46f8b0d3024ca284 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 14 May 2020 02:24:39 -0700 Subject: [PATCH 187/412] Add Starlark rules to generate cubin headers. 
Also add a cuda_gpu_architectures macro for getting a list of CUDA GPU architectures. PiperOrigin-RevId: 311494598 Change-Id: Ie573c2d22a42ab9e0002bdcfbee5be534b87cd2c --- .../compiler/mlir/tools/kernel_gen/BUILD | 1 + .../core/kernels/cubin_headers/build_defs.bzl | 101 ++++++++++++++++++ third_party/gpus/cuda/BUILD.tpl | 8 ++ third_party/gpus/cuda/build_defs.bzl.tpl | 4 + third_party/gpus/cuda_configure.bzl | 15 +++ 5 files changed, 129 insertions(+) create mode 100644 tensorflow/core/kernels/cubin_headers/build_defs.bzl diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index d4269c336e9..27a8dbd2809 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -40,6 +40,7 @@ cc_library( tf_cc_binary( name = "tf_to_cubin", srcs = ["tf_to_cubin.cc"], + visibility = ["//tensorflow/core/kernels/cubin_headers:__pkg__"], deps = [ ":cubin_creator", "//tensorflow/core:framework_internal", diff --git a/tensorflow/core/kernels/cubin_headers/build_defs.bzl b/tensorflow/core/kernels/cubin_headers/build_defs.bzl new file mode 100644 index 00000000000..b09c515c883 --- /dev/null +++ b/tensorflow/core/kernels/cubin_headers/build_defs.bzl @@ -0,0 +1,101 @@ +"""Generates cubin headers for TF dialect ops.""" + +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures", "if_cuda") + +def _lookup_file(filegroup, path): + """Extracts file at (relative) path in filegroup.""" + for file in filegroup.files.to_list(): + if file.path.endswith(path): + return file + return None + +def _gen_kernel_image_hdr_impl(ctx): + if not ctx.attr.gpu_archs: + fail("No GPU architecture specified, use --config=cuda or similar") + + name = ctx.attr.name + tile_sizes = ctx.attr.tile_size.replace("x", ",") + same_shape = [] + if ctx.attr.same_shape: + same_shape.append("--same_shape=%s" % ctx.attr.same_shape) + + cubins = [] + images = [] + for arch in ctx.attr.gpu_archs: + filename = "%s.%s.cubin" % (name, arch) + cubin = ctx.actions.declare_file(filename) + ctx.actions.run( + outputs = [cubin], + executable = ctx.executable._tool, + arguments = same_shape + [ + "--tile_sizes=%s" % tile_sizes, + "--arch=%s" % arch.split("_")[1], + "--output=%s" % cubin.path, + ctx.attr.op, + ], + mnemonic = "compile", + ) + cubins.append(cubin) + images.append("--image=profile=%s,file=%s" % (arch, cubin.path)) + + # Generate fatbin file from all cubins. 
+ fatbin = ctx.actions.declare_file("%s.fatbin" % name) + ctx.actions.run( + outputs = [fatbin], + inputs = cubins, + executable = _lookup_file(ctx.attr._cuda_root, "bin/fatbinary"), + arguments = [ + "--64", + "--cmdline=--compile-only", + "--link", + "--compress-all", + "--create=%s" % fatbin.path, + ] + images, + mnemonic = "fatbinary", + ) + + bin2c = _lookup_file(ctx.attr._cuda_root, "bin/bin2c") + ctx.actions.run_shell( + outputs = [ctx.outputs.out], + inputs = [fatbin], + tools = [bin2c], + command = "%s --static --const --type=int --name=%s %s 1> %s" % + (bin2c.path, ctx.attr.symbol, fatbin.path, ctx.outputs.out.path), + mnemonic = "bin2c", + ) + +_gen_kernel_image_hdr = rule( + implementation = _gen_kernel_image_hdr_impl, + output_to_genfiles = True, + attrs = { + "op": attr.string(mandatory = True), + "tile_size": attr.string(mandatory = True), + "same_shape": attr.string(), + "out": attr.output(mandatory = True), + "symbol": attr.string(mandatory = True), + "gpu_archs": attr.string_list(mandatory = True), + "_cuda_root": attr.label( + default = Label("@local_config_cuda//cuda:cuda_root"), + ), + "_tool": attr.label( + executable = True, + default = Label("//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_cubin"), + cfg = "host", + ), + }, +) + +def gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None): + """Generates a C header with fatbin data from a Tensorflow op.""" + if_cuda( + if_true = [_gen_kernel_image_hdr( + name = name, + op = op, + tile_size = tile_size, + same_shape = same_shape, + out = "%s.h" % name, + symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""), + gpu_archs = cuda_gpu_architectures(), + tags = tags, + )], + ) diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index 9d17e1b8f35..92586dd7d11 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -166,6 +166,14 @@ cc_library( data = [":cuda-nvvm"], ) +filegroup( + name = "cuda_root", + srcs = [ + "cuda/bin/fatbinary", + "cuda/bin/bin2c", + ], +) + bzl_library( name = "build_defs_bzl", srcs = ["build_defs.bzl"], diff --git a/third_party/gpus/cuda/build_defs.bzl.tpl b/third_party/gpus/cuda/build_defs.bzl.tpl index 3280d6b041f..bba772e2377 100644 --- a/third_party/gpus/cuda/build_defs.bzl.tpl +++ b/third_party/gpus/cuda/build_defs.bzl.tpl @@ -51,6 +51,10 @@ def cuda_is_configured(): """Returns true if CUDA was enabled during the configure process.""" return %{cuda_is_configured} +def cuda_gpu_architectures(): + """Returns a list of supported GPU architectures.""" + return %{cuda_gpu_architectures} + def if_cuda_is_configured(x): """Tests if the CUDA was enabled during the configure process. diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index c587f117deb..aa8a2f0226d 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -714,6 +714,7 @@ def _create_dummy_repository(repository_ctx): { "%{cuda_is_configured}": "False", "%{cuda_extra_copts}": "[]", + "%{cuda_gpu_architectures}": "[]", }, ) _tpl( @@ -842,6 +843,16 @@ def _compute_cuda_extra_copts(repository_ctx, compute_capabilities): ] return str(capability_flags) +def _compute_cuda_gpu_architectures(repository_ctx, compute_capabilities): + gpu_architectures = [ + "sm_" + capability.replace(".", "") + for capability in compute_capabilities + ] + + # Make the list unique. 
+ gpu_architectures = dict(zip(gpu_architectures, gpu_architectures)).keys() + return str(gpu_architectures) + def _tpl_path(repository_ctx, filename): return repository_ctx.path(Label("//third_party/gpus/%s.tpl" % filename)) @@ -973,6 +984,10 @@ def _create_local_cuda_repository(repository_ctx): repository_ctx, cuda_config.compute_capabilities, ), + "%{cuda_gpu_architectures}": _compute_cuda_gpu_architectures( + repository_ctx, + cuda_config.compute_capabilities, + ), }, ) From 5767af0cd2c3327d05f84acddcdf4152e6543f58 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:26:18 -0700 Subject: [PATCH 188/412] Bump open source llvm revision to bfa200ebcf3706fde0dde335a3c1fa3fe1b3ba3f PiperOrigin-RevId: 311494763 Change-Id: I218a77222ac4ca3131d2614ea84d1268d5de655e --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 9b745656125..f4d60f07149 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "897d8ee5cd693e17f95a7e84194bca4c089a520b" - LLVM_SHA256 = "994677daedf23bc93ce04f1a527c07c09b7fbbd0986d867b60bd6710057a40de" + LLVM_COMMIT = "bfa200ebcf3706fde0dde335a3c1fa3fe1b3ba3f" + LLVM_SHA256 = "72deefcfe20434cb27a31ff9503c348dcf21065dbd27e9fa54c1fb3f5089b8e1" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 85bf5f7c202f1c656ebf169592aa4a0a9c022e8a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 02:32:13 -0700 Subject: [PATCH 189/412] Return a meaningful error for dynamic shape inputs with outside compilation head extraction in TPUs. PiperOrigin-RevId: 311495416 Change-Id: I42b12ac545224c32e770d963a5f3f333ba280531 --- .../python/distribute/tpu_strategy_test.py | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index 6c93e29c028..de4c975d5ef 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -28,7 +28,6 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import remote from tensorflow.python.eager import test -from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -141,9 +140,6 @@ class TPUStrategyTest(test.TestCase): # for non-local TPU. if FLAGS.tpu: self.skipTest("Recovery fails for non-local TPU, see b/148150981") - - # Disable automatic outside compilation. - config.set_soft_device_placement(False) strategy = get_tpu_strategy() @def_function.function @@ -168,28 +164,6 @@ class TPUStrategyTest(test.TestCase): good_run() - def test_dynamic_shape_with_outside_compilation_failure(self): - # Enable automatic outside compilation. 
- config.set_soft_device_placement(True) - strategy = get_tpu_strategy() - dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( - 2, drop_remainder=False) - dataset = strategy.experimental_distribute_dataset(dataset) - iterator = iter(dataset) - - @def_function.function - def train_fn(iterator): - - def step_fn(inputs): - _, inputs = inputs - return math_ops.reduce_sum(inputs) - - return strategy.experimental_local_results( - strategy.run(step_fn, args=(next(iterator),))) - - with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): - logging.info(train_fn(iterator)) - def test_computation_on_subset_cores(self): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) From a04c8be3e7086d9e14ba37c3c0945a3ea98414ce Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Thu, 14 May 2020 03:13:12 -0700 Subject: [PATCH 190/412] Update XNNPACK dependency Bring in memory optimization for XNNPACK delegate in TF Lite PiperOrigin-RevId: 311500960 Change-Id: I49b093ab177ca2e4806ed42390e367b58b14dc85 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index f4d60f07149..c3d097a8362 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -164,11 +164,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "XNNPACK", - sha256 = "15a300dec0d483af67310ed2edf76a6eff643e1438d0612ad00a372add472c22", - strip_prefix = "XNNPACK-5cb16e7ace0fcdcab164af01620a606ba828a3be", + sha256 = "0440d9ad632945f10992664be84eb0c0c76581f8474df3c124aa30350981126c", + strip_prefix = "XNNPACK-d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/5cb16e7ace0fcdcab164af01620a606ba828a3be.zip", - "https://github.com/google/XNNPACK/archive/5cb16e7ace0fcdcab164af01620a606ba828a3be.zip", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip", + "https://github.com/google/XNNPACK/archive/d9a7e85c30a2bea7b6b263f21f066a93cb2b4dee.zip", ], ) From 015197cf8bda5010b2b12170da738bdb66482551 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 14 May 2020 06:41:25 -0700 Subject: [PATCH 191/412] Generate cubin headers for bias_add and relu. Also, instead of checking if_cuda, check whether cuda_gpu_architectures() is non-empty. PiperOrigin-RevId: 311521784 Change-Id: I6a1a7e9cefc8e845e69d62fb3c19d9976b0f2196 --- tensorflow/core/kernels/cubin_headers/BUILD | 47 +++++++++++++++++++ .../core/kernels/cubin_headers/build_defs.bzl | 9 ++-- 2 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 tensorflow/core/kernels/cubin_headers/BUILD diff --git a/tensorflow/core/kernels/cubin_headers/BUILD b/tensorflow/core/kernels/cubin_headers/BUILD new file mode 100644 index 00000000000..bb7995dd221 --- /dev/null +++ b/tensorflow/core/kernels/cubin_headers/BUILD @@ -0,0 +1,47 @@ +# Generates headers containing cubin for CUDA kernels. 
+load("//tensorflow/core/kernels/cubin_headers:build_defs.bzl", "gen_kernel_image_hdr") + +bias_add_kernel = """ +func @bias_add(%arg0: tensor, + %arg1: tensor) -> tensor { + %0 = "tf.BiasAdd"(%arg0, %arg1) { T = "tfdtype$DT_TYPE" } + : (tensor, tensor) -> tensor + return %0 : tensor +} +""" + +[ + gen_kernel_image_hdr( + name = "bias_add_{type}_kernel".format(type = type), + op = bias_add_kernel.replace("f99", type).replace("DT_TYPE", dtype), + same_shape = "0,2", + tile_size = "16x16", + ) + for (type, dtype) in [ + ("f16", "DT_HALF"), + ("f32", "DT_FLOAT"), + ("f64", "DT_DOUBLE"), + ] +] + +relu_kernel = """ +func @relu(%arg0: tensor) -> tensor { + %0 = "tf.Relu"(%arg0) { T = "tfdtype$DT_TYPE" } + : (tensor) -> tensor + return %0 : tensor +} +""" + +[ + gen_kernel_image_hdr( + name = "relu_{type}_kernel".format(type = type), + op = relu_kernel.replace("f99", type).replace("DT_TYPE", dtype), + same_shape = "0,1", + tile_size = "256", + ) + for (type, dtype) in [ + ("f16", "DT_HALF"), + ("f32", "DT_FLOAT"), + ("f64", "DT_DOUBLE"), + ] +] diff --git a/tensorflow/core/kernels/cubin_headers/build_defs.bzl b/tensorflow/core/kernels/cubin_headers/build_defs.bzl index b09c515c883..14f47601f06 100644 --- a/tensorflow/core/kernels/cubin_headers/build_defs.bzl +++ b/tensorflow/core/kernels/cubin_headers/build_defs.bzl @@ -1,6 +1,6 @@ """Generates cubin headers for TF dialect ops.""" -load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures", "if_cuda") +load("@local_config_cuda//cuda:build_defs.bzl", "cuda_gpu_architectures") def _lookup_file(filegroup, path): """Extracts file at (relative) path in filegroup.""" @@ -87,8 +87,8 @@ _gen_kernel_image_hdr = rule( def gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None): """Generates a C header with fatbin data from a Tensorflow op.""" - if_cuda( - if_true = [_gen_kernel_image_hdr( + if cuda_gpu_architectures(): + _gen_kernel_image_hdr( name = name, op = op, tile_size = tile_size, @@ -97,5 +97,4 @@ def gen_kernel_image_hdr(name, op, tile_size, tags = [], same_shape = None): symbol = "k%s" % name.replace("_", " ").title().replace(" ", ""), gpu_archs = cuda_gpu_architectures(), tags = tags, - )], - ) + ) From ec2cc2903f54d526dfdcfa314c9e181a8a5f76fa Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 14 May 2020 07:41:59 -0700 Subject: [PATCH 192/412] Introduce a higher-level function handling in the tracing oriented unified API This patch intends to make function tracing more of a first class concept in the API. It tries to move away from the "flat graph" model with "placeholder" operation introduced with the expectation to turn them into function parameters later. Instead the user starts by creating an empty function which is an ExecutionContext (and as such can trace operations). Function parameters can get added to this context using a dedicated API returning an AbstractTensor. The diff in UnifiedCAPI/TestBasicGraph is probably a good illustration of the change from a client point of view. Another important point of this patch is to make it so that no C public API is defined in the `c_api_unified_experimental_graph.cc` file, instead the implementation is dispatched based on a registered factory function to create the tracing context. This will allow to swap the tracing implementation through injection later. 
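For reference, a condensed client-side sketch of the new flow, based on the UnifiedCAPI/TestBasicGraph test added in this patch (status checks, the eager-mode invocation of the finalized function, and cleanup of tensors/output lists are omitted; the op and tensor names are illustrative only):

  TF_Status* s = TF_NewStatus();
  TF_SetTracingImplementation("graphdef");

  // Start tracing a function named "double"; the context records traced ops.
  TF_ExecutionContext* func_ctx = TF_CreateFunction("double", s);

  // Parameters are added directly to the tracing context instead of being
  // created as placeholder operations.
  TF_AbstractTensor* x = TF_AddFunctionParameter(func_ctx, TF_FLOAT, s);

  // Trace an Add op that consumes the parameter twice.
  TF_AbstractOp* add_op = TF_NewAbstractOp(func_ctx);
  TF_AbstractOpSetOpType(add_op, "Add", s);
  TF_AbstractOpSetOpName(add_op, "my_add", s);
  TF_AbstractTensor* inputs[2] = {x, x};
  TF_OutputList* outputs = TF_NewOutputList();
  TF_ExecuteOperation(add_op, 2, inputs, outputs, func_ctx, s);
  TF_DeleteAbstractOp(add_op);

  // Finalizing consumes func_ctx and yields a function that can then be
  // registered with an eager context and executed there.
  TF_AbstractFunction* func = TF_FinalizeFunction(func_ctx, outputs, s);
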
PiperOrigin-RevId: 311529850 Change-Id: I822047f4306835abc0e044dc87c14179596f64bd --- tensorflow/c/eager/BUILD | 2 + .../c/eager/c_api_unified_experimental.cc | 69 +++++++++++ .../c/eager/c_api_unified_experimental.h | 26 ++-- .../eager/c_api_unified_experimental_eager.cc | 11 ++ .../eager/c_api_unified_experimental_graph.cc | 111 ++++++++---------- .../c_api_unified_experimental_internal.h | 17 +++ .../eager/c_api_unified_experimental_test.cc | 73 +++++------- 7 files changed, 193 insertions(+), 116 deletions(-) diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index d3059df1bef..69808f6f49f 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -448,6 +448,8 @@ tf_cuda_library( "//conditions:default": [], }) + [ "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/container:flat_hash_map", "//tensorflow/c:tf_status_helper", "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index 68afffb28b4..d29c457798e 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "absl/container/flat_hash_map.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/c_api_unified_experimental_internal.h" #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" @@ -26,6 +28,51 @@ using tensorflow::string; using tensorflow::internal::OutputList; using tensorflow::internal::unwrap; +namespace tensorflow { +namespace internal { +typedef absl::flat_hash_map FactoriesMap; + +static FactoriesMap& GetFactories() { + static FactoriesMap* factories = new FactoriesMap; + return *factories; +} + +static const char* default_factory = ""; + +void RegisterTracingEngineFactory(const string& name, FactoryFunction factory) { + assert((!GetFactories().count(name)) || + (GetFactories()[name] == factory) && + "Duplicate tracing factory registration"); + GetFactories()[name] = factory; +} + +void SetDefaultTracingEngine(const char* name) { default_factory = name; } + +static ExecutionContext* CreateTracingExecutionContext(const char* fn_name, + TF_Status* s) { + auto entry = GetFactories().find(default_factory); + if (entry != GetFactories().end()) return entry->second(fn_name, s); + string msg = absl::StrCat( + "No tracing engine factory has been registered with the key '", + default_factory, "' (available: "); + // Ensure deterministic (sorted) order in the error message + std::set factories_sorted; + for (const auto& factory : GetFactories()) + factories_sorted.insert(factory.first); + const char* comma = ""; + for (const string& factory : factories_sorted) { + msg += comma + factory; + comma = ", "; + } + msg += ")"; + + TF_SetStatus(s, TF_INVALID_ARGUMENT, msg.c_str()); + return nullptr; +} + +} // end namespace internal +} // end namespace tensorflow + // ============================================================================= // Public C API entry points // @@ -36,6 +83,28 @@ using tensorflow::internal::unwrap; // // ============================================================================= +void TF_SetTracingImplementation(const char* name) { + tensorflow::internal::SetDefaultTracingEngine(name); +} + +// Creates a new TensorFlow function, it is an execution context attached to a +// given tracing 
context. +TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* s) { + return wrap(tensorflow::internal::CreateTracingExecutionContext(fn_name, s)); +} + +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList* outputs, TF_Status* s) { + auto* func = wrap(unwrap(ctx)->Finalize(unwrap(outputs), s)); + TF_DeleteExecutionContext(ctx); + return func; +} + +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s) { + return wrap(unwrap(func)->AddParameter(dtype, s)); +} + void TF_DeleteExecutionContext(TF_ExecutionContext* c) { delete unwrap(c); } TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* c) { diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index be8fc64c2e1..512717caa34 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -49,15 +49,26 @@ typedef struct TF_AbstractOp TF_AbstractOp; // setting functional attributes of other composite ops e.g. control flow. typedef struct TF_AbstractFunction TF_AbstractFunction; -// Creates a context for tracing the execution of operations into a function. -TF_ExecutionContext* TF_NewGraphExecutionContext(TF_Status* s); +// This allows the client to swap the implementation of the tracing engine. +// Any future call to TF_CreateFunction will use the implementation defined +// here. +void TF_SetTracingImplementation(const char* name); + +// Creates a new TensorFlow function. A Function is an execution context, and as +// such it can trace operations through TF_ExecuteOperation. After completing +// tracing, a function can be obtained by TF_FinalizeFunction. +TF_ExecutionContext* TF_CreateFunction(const char* fn_name, TF_Status* status); // Creates a context for eager execution of operations. TF_ExecutionContext* TF_NewEagerExecutionContext(TFE_ContextOptions*, TF_Status* s); - void TF_DeleteExecutionContext(TF_ExecutionContext*); +// Add a new parameter to a TensorFlow Function. +// TODO(aminim): what about shape? +TF_AbstractTensor* TF_AddFunctionParameter(TF_ExecutionContext* func, + TF_DataType dtype, TF_Status* s); + // Create an operation suitable to use with the provided context. The operation // requires its type (e.g. "AddV2") to be set independently. TF_AbstractOp* TF_NewAbstractOp(TF_ExecutionContext* ctx); @@ -100,13 +111,12 @@ void TF_ExecuteOperation(TF_AbstractOp* op, int num_inputs, TF_ExecutionContext* ctx, TF_Status* s); // Creates a new TF_AbstractFunction from the current tracing states in the -// context. The returned TF_GraphToFunction must be deleted by the client. +// context. The provided `ctx` is consumed by this API call and deleted. +// The returned TF_AbstractFunction must be deleted by the client, // TODO(aminim): clarify the contract on the state of the context after this // call. 
-TF_AbstractFunction* TF_ExecutionContextToFunction( - const TF_ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const TF_AbstractTensor* inputs, int num_outputs, - const TF_AbstractTensor* outputs, TF_Status* status); +TF_AbstractFunction* TF_FinalizeFunction(TF_ExecutionContext* ctx, + TF_OutputList*, TF_Status*); void TF_DeleteAbstractFunction(TF_AbstractFunction*); diff --git a/tensorflow/c/eager/c_api_unified_experimental_eager.cc b/tensorflow/c/eager/c_api_unified_experimental_eager.cc index 820c61445fb..cf8cf845834 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_eager.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_eager.cc @@ -123,6 +123,17 @@ class EagerContext : public ExecutionContext { } } + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't add function parameter on an eager context."); + return nullptr; + } + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + TF_SetStatus(s, TF_INVALID_ARGUMENT, + "Can't use finalize function on an eager context."); + return nullptr; + } + void RegisterFunction(AbstractFunction* afunc, TF_Status* s) override { auto* func = afunc->GetTfFunction(s); if (!func) { diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 36f8353894b..e38332e3e8e 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" #include "tensorflow/c/c_api.h" #include "tensorflow/c/eager/c_api_internal.h" #include "tensorflow/c/eager/c_api_unified_experimental.h" @@ -114,12 +115,14 @@ struct GraphFunction : public AbstractFunction { static constexpr AbstractFunctionKind kKind = kGraphFunc; }; -// GraphContext wraps a TF_Graph and manages the "execution" of operation, i.e. -// adding them to the graph. +// GraphContext wraps a TF_Graph modeling a single function and manages the +// "execution" of operation, i.e. adding them to the function. class GraphContext : public ExecutionContext { public: - GraphContext() - : ExecutionContext(kKind), graph_(new TF_Graph(), TF_DeleteGraph) {} + explicit GraphContext(const char* name) + : ExecutionContext(kKind), + graph_(new TF_Graph(), TF_DeleteGraph), + name_(name) {} AbstractOp* CreateOperation() override { // TODO(srbs): Should the lifetime of this op be tied to the context. 
@@ -164,24 +167,38 @@ class GraphContext : public ExecutionContext { } } - TF_Function* ToFunction(const char* fn_name, int num_inputs, - const GraphTensor* inputs, int num_outputs, - const GraphTensor* outputs, TF_Status* status) const { - std::vector graph_inputs; - graph_inputs.resize(num_inputs); + AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) override { + TF_OperationDescription* opdesc = + TF_NewOperation(graph_.get(), "Placeholder", + absl::StrCat("_input_", inputs_.size()).c_str()); + TF_SetAttrType(opdesc, "dtype", dtype); + auto* operation = TF_FinishOperation(opdesc, s); + if (!s->status.ok()) return nullptr; + + inputs_.push_back(TF_Output{operation, 0}); + return new GraphTensor(inputs_.back(), this); + } + + AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) override { + std::unique_ptr func(new GraphFunction); std::vector graph_outputs; - graph_outputs.resize(num_outputs); - for (int i = 0; i < num_inputs; i++) { - graph_inputs[i] = inputs[i].output; - } - for (int i = 0; i < num_outputs; i++) { - graph_outputs[i] = outputs[i].output; + graph_outputs.reserve(outputs->outputs.size()); + for (AbstractTensor* abstract_output : outputs->outputs) { + GraphTensor* output = dyncast(abstract_output); + if (!output) { + TF_SetStatus(s, TF_UNIMPLEMENTED, + "Returning a non-graph tensor from a function has not " + "been implemented yet."); + return nullptr; + } + graph_outputs.push_back(output->output); } - return TF_GraphToFunction(graph_.get(), fn_name, 0, -1, nullptr, - graph_inputs.size(), graph_inputs.data(), - graph_outputs.size(), graph_outputs.data(), - nullptr, nullptr, fn_name, status); + func->func = TF_GraphToFunction( + graph_.get(), name_, 0, -1, nullptr, inputs_.size(), inputs_.data(), + graph_outputs.size(), graph_outputs.data(), nullptr, nullptr, name_, s); + if (TF_GetCode(s) != TF_OK) return nullptr; + return func.release(); } void RegisterFunction(AbstractFunction* func, TF_Status* s) override { @@ -195,54 +212,20 @@ class GraphContext : public ExecutionContext { private: std::unique_ptr graph_; + std::vector inputs_; + const char* name_; }; -// Helper that converts the graph currently held in the context into a function. -static AbstractFunction* ExecutionContextToFunction( - const ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const AbstractTensor* inputs, int num_outputs, - const AbstractTensor* outputs, TF_Status* status) { - auto* graph_ctx = dyncast(fn_body); - if (graph_ctx == nullptr) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, - "fn_body is not a TF_GraphContext."); - return nullptr; - } - auto* graph_inputs = dyncast(inputs); - if (!graph_inputs) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, "inputs aren't GraphTensors."); - return nullptr; - } - auto* graph_outputs = dyncast(outputs); - if (!graph_outputs) { - TF_SetStatus(status, TF_INVALID_ARGUMENT, "outputs aren't GraphTensors."); - return nullptr; - } - GraphFunction* func = new GraphFunction; - func->func = graph_ctx->ToFunction(fn_name, num_inputs, graph_inputs, - num_outputs, graph_outputs, status); - return func; +static ExecutionContext* GraphTracingFactory(const char* name, TF_Status* s) { + return new GraphContext(name); } +// Register the tracing implemented in this file as the default tracing engine. 
+static bool register_tracing = [] { + RegisterTracingEngineFactory("graphdef", GraphTracingFactory); + SetDefaultTracingEngine("graphdef"); + return true; +}(); + } // namespace internal } // namespace tensorflow - -// ============================================================================= -// Public C API entry points -// These are only the entry points specific to the Graph API. -// ============================================================================= - -using tensorflow::internal::unwrap; - -TF_ExecutionContext* TF_NewGraphExecutionContext(TF_Status* s) { - return wrap(new tensorflow::internal::GraphContext()); -} - -TF_AbstractFunction* TF_ExecutionContextToFunction( - const TF_ExecutionContext* fn_body, const char* fn_name, int num_inputs, - const TF_AbstractTensor* inputs, int num_outputs, - const TF_AbstractTensor* outputs, TF_Status* status) { - return wrap(ExecutionContextToFunction(unwrap(fn_body), fn_name, num_inputs, - unwrap(inputs), num_outputs, - unwrap(outputs), status)); -} diff --git a/tensorflow/c/eager/c_api_unified_experimental_internal.h b/tensorflow/c/eager/c_api_unified_experimental_internal.h index ab085a20ff0..49212a230ee 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_internal.h +++ b/tensorflow/c/eager/c_api_unified_experimental_internal.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/c/tf_datatype.h" #include "tensorflow/c/tf_status.h" #include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/types.h" namespace tensorflow { namespace internal { @@ -148,6 +149,17 @@ struct ExecutionContext { // Creates an empty AbstractOperation suitable to use with this context. virtual AbstractOp* CreateOperation() = 0; + // Add a function parameter and return the corresponding tensor. + // This is only valid with an ExecutionContext obtained from a TracingContext, + // it'll always error out with an eager context. + virtual AbstractTensor* AddParameter(TF_DataType dtype, TF_Status* s) = 0; + + // Finalize this context and make a function out of it. The context is in a + // invalid state after this call and must be destroyed. + // This is only valid with an ExecutionContext obtained from a TracingContext, + // it'll always error out with an eager context. + virtual AbstractFunction* Finalize(OutputList* outputs, TF_Status* s) = 0; + // Registers a functions with this context, after this the function is // available to be called/referenced by its name in this context. virtual void RegisterFunction(AbstractFunction* func, TF_Status* s) = 0; @@ -156,6 +168,11 @@ struct ExecutionContext { const ExecutionContextKind k; }; +typedef ExecutionContext* (*FactoryFunction)(const char* fn_name, TF_Status*); +void SetDefaultTracingEngine(const char* name); +void RegisterTracingEngineFactory(const ::tensorflow::string& name, + FactoryFunction factory); + // Create utilities to wrap/unwrap: this convert from the C opaque types to the // C++ implementation, and back. 
#define MAKE_WRAP_UNWRAP(C_TYPEDEF, CPP_CLASS) \ diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index bd99189852e..9f56c8aa579 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -29,7 +29,12 @@ using tensorflow::string; namespace tensorflow { namespace { -TEST(UnifiedCAPI, TestBasicEager) { +class UnifiedCAPI : public ::testing::TestWithParam { + protected: + void SetUp() override { TF_SetTracingImplementation(GetParam()); } +}; + +TEST_P(UnifiedCAPI, TestBasicEager) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -81,33 +86,18 @@ TEST(UnifiedCAPI, TestBasicEager) { TF_DeleteExecutionContext(ctx); } -TEST(UnifiedCAPI, TestBasicGraph) { +TEST_P(UnifiedCAPI, TestBasicGraph) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + // Start a new function / execution context. + string fn_name = "double"; + TF_ExecutionContext* graph_ctx = + TF_CreateFunction(fn_name.c_str(), status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - // Add a placeholder to the graph. - auto* placeholder_op = TF_NewAbstractOp(graph_ctx); - TF_AbstractOpSetOpType(placeholder_op, "Placeholder", status.get()); + auto* placeholder_t = + TF_AddFunctionParameter(graph_ctx, TF_FLOAT, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOpSetOpName(placeholder_op, "my_ph", status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractOpSetAttrType(placeholder_op, "dtype", TF_FLOAT, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - - // Build inputs and outputs. - TF_OutputList* placeholder_outputs = TF_NewOutputList(); - - // Execute. - TF_ExecuteOperation(placeholder_op, 0, nullptr, placeholder_outputs, - graph_ctx, status.get()); - ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - ASSERT_EQ(1, TF_OutputListNumOutputs(placeholder_outputs)); - TF_AbstractTensor* placeholder_t = TF_OutputListGet(placeholder_outputs, 0); - - // Delete placeholder op. - TF_DeleteAbstractOp(placeholder_op); // Build an abstract operation. auto* add_op = TF_NewAbstractOp(graph_ctx); @@ -123,17 +113,13 @@ TEST(UnifiedCAPI, TestBasicGraph) { // Execute. TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_AbstractTensor* output_t = TF_OutputListGet(add_outputs, 0); // Clean up operation and inputs. TF_DeleteAbstractOp(add_op); - string fn_name = "double"; - TF_AbstractFunction* func = TF_ExecutionContextToFunction( - graph_ctx, fn_name.c_str(), 1, placeholder_t, 1, output_t, status.get()); + TF_AbstractFunction* func = + TF_FinalizeFunction(graph_ctx, add_outputs, status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - TF_DeleteAbstractTensor(placeholder_t); - TF_DeleteAbstractTensor(output_t); // Build eager context. 
TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -174,18 +160,16 @@ TEST(UnifiedCAPI, TestBasicGraph) { ASSERT_EQ(*f_value, 4.0); TF_DeleteOutputList(add_outputs); - TF_DeleteOutputList(placeholder_outputs); TF_DeleteAbstractOp(fn_op); TF_DeleteAbstractTensor(input_t); TF_DeleteAbstractTensor(final_result); TF_DeleteTensor(f_t); TF_DeleteAbstractFunction(func); - TF_DeleteExecutionContext(graph_ctx); TF_DeleteExecutionContext(eager_execution_ctx); } -TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { +TEST_P(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); TFE_ContextOptions* opts = TFE_NewContextOptions(); @@ -193,18 +177,15 @@ TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); TFE_DeleteContextOptions(opts); - TF_AbstractFunction* func = TF_ExecutionContextToFunction( - ctx, nullptr, 0, nullptr, 0, nullptr, status.get()); + TF_AbstractFunction* func = TF_FinalizeFunction(ctx, nullptr, status.get()); ASSERT_EQ(nullptr, func); ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get())); - - TF_DeleteExecutionContext(ctx); } -TEST(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { +TEST_P(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -222,10 +203,10 @@ TEST(UnifiedCAPI, TF_CallingSetOpTypeAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { +TEST_P(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -243,7 +224,7 @@ TEST(UnifiedCAPI, TF_CallingSetOpNameAfterFinishingOpBuildingRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { +TEST_P(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { // Build an Eager context. std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -273,7 +254,8 @@ TEST(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Build a Graph context. - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Execute eager op using graph context. 
@@ -289,10 +271,11 @@ TEST(UnifiedCAPI, TestExecutingEagerOpInGraphModeRaises) { TF_DeleteExecutionContext(graph_ctx); } -TEST(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { +TEST_P(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); - TF_ExecutionContext* graph_ctx = TF_NewGraphExecutionContext(status.get()); + ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + TF_ExecutionContext* graph_ctx = TF_CreateFunction("some_func", status.get()); ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); // Add a placeholder to the graph. @@ -349,5 +332,7 @@ TEST(UnifiedCAPI, TestExecutingGraphOpInEagerModeRaises) { TF_DeleteExecutionContext(eager_execution_ctx); } +INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, ::testing::Values("graphdef")); + } // namespace } // namespace tensorflow From e2f8f5ad62b0deeff639e065f62a978c416c0c6b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 07:46:56 -0700 Subject: [PATCH 193/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311530506 Change-Id: Ifcfd3d1247eba8a92c3a44f883cf4a098afdbce6 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index 598e3a48bfe..c6d67c9ad44 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12059,7 +12059,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12070,7 +12070,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18975,7 +18975,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18986,7 +18986,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19390,7 +19390,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. 
// // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20461,7 +20461,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21633,7 +21633,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22341,7 +22341,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22537,7 +22537,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22606,7 +22606,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22721,7 +22721,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22780,7 +22780,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22954,7 +22954,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23331,7 +23331,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25654,7 +25654,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25717,7 +25717,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25968,7 +25968,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26452,7 +26452,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45540,7 +45540,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47480,7 +47480,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47551,7 +47551,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48540,7 +48540,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 5d92849778771a475fe339d2954db12c3d4ecc2b Mon Sep 17 00:00:00 2001 From: Guozhong Zhuang Date: Thu, 14 May 2020 08:28:07 -0700 Subject: [PATCH 194/412] fix conv_ops_test and remapper_test --- .../core/grappler/optimizers/remapper_test.cc | 3 +++ tensorflow/core/kernels/conv_ops_test.cc | 23 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc index 35e09b28205..52f420c57cc 100644 --- a/tensorflow/core/grappler/optimizers/remapper_test.cc +++ b/tensorflow/core/grappler/optimizers/remapper_test.cc @@ -607,6 +607,7 @@ TEST_F(RemapperTest, FuseMatMulWithBiasAndActivation) { } } +#ifndef INTEL_MKL TEST_F(RemapperTest, FuseConv2DWithBatchNorm) { using ops::Placeholder; @@ -685,6 +686,7 @@ TEST_F(RemapperTest, FuseConv2DWithBatchNorm) { test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } + TEST_F(RemapperTest, FuseConv2DWithBatchNormAndActivation) { using ops::Placeholder; @@ -850,6 +852,7 @@ TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) { ASSERT_EQ(tensors.size(), 1); test::ExpectTensorNear(tensors[0], tensors_expected[0], 1e-6); } +#endif } // namespace grappler } // namespace tensorflow diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc index 21dffa3cc5e..9e9ca27a570 100644 --- a/tensorflow/core/kernels/conv_ops_test.cc +++ b/tensorflow/core/kernels/conv_ops_test.cc @@ -1028,12 +1028,14 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) { this->VerifyConv2DWithBias(filter_size, filter_count); } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; this->VerifyConv2DWithBias(filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } +#endif TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1062,6 +1064,7 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndActivation) { } } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1072,6 +1075,7 @@ TYPED_TEST_P(FusedConv2DWithBiasOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } +#endif // -------------------------------------------------------------------------- // // Conv2D + FusedBatchNorm + {Activation} // @@ -1095,6 +1099,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) { this->VerifyConv2DWithBatchNorm(filter_size, filter_count); } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { const int filter_size = 3; const int filter_count = 12; @@ -1102,6 +1107,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) { filter_size, filter_count, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } +#endif TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndActivation) { const int filter_size = 1; @@ -1131,6 +1137,7 @@ TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndActivation) { } } +#ifndef INTEL_MKL TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolutionAndActivation) { const int filter_size = 3; @@ -1141,34 +1148,50 @@ 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0}); } } +#endif REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest, // OneByOneConvolution, // ImageSizeConvolution, // SpatialConvolution, // +#ifndef INTEL_MKL ExplicitPaddingConvolution, // +#endif OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // +#ifndef INTEL_MKL SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); +#else + SpatialConvolutionAndActivation); +#endif REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest, // OneByOneConvolution, // ImageSizeConvolution, // SpatialConvolution, // +#ifndef INTEL_MKL ExplicitPaddingConvolution, // +#endif OneByOneConvolutionAndActivation, // ImageSizeConvolutionAndActivation, // +#ifndef INTEL_MKL SpatialConvolutionAndActivation, // ExplicitPaddingConvolutionAndActivation); +#else + SpatialConvolutionAndActivation); +#endif using FusedBiasAddDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest, FusedBiasAddDataTypes); + +#ifndef INTEL_MKL using FusedBatchNormDataTypes = ::testing::Types; INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest, FusedBatchNormDataTypes); +#endif #endif // TENSORFLOW_USE_ROCM } // namespace tensorflow From 10c7f276e41f6b1790d8e767f77b9f5583419ad5 Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 17:37:50 +0200 Subject: [PATCH 195/412] Test autograph indirect tf.map_fn decorator --- tensorflow/python/kernel_tests/map_fn_test.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 1e10d689886..a5c860b407d 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -186,6 +186,24 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) + @test_util.run_in_graph_and_eager_modes + def testMap_autograph_indirect(): + def test_function(x): + cond = tf.constant(-1) + if cond == 0: + result = x + else: + result = x + return result + + @tf.function + def map_call(x): + tf.map_fn(test_function, x) + + x = constant_op.constant([1]) + y = map_call(x) + self.assertAllEqual([1], self.evaluate(y)) + @test_util.run_in_graph_and_eager_modes def testMapShape(self): x = constant_op.constant([[1, 2, 3], [4, 5, 6]]) From ed01ecd92d4376d519247f1d3ce2d8ab5c1d99da Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 08:35:00 -0700 Subject: [PATCH 196/412] Clarify docstring: At EOF, GFile.readline() returns "". Along the way, fix a comment about the same topic. PiperOrigin-RevId: 311537677 Change-Id: I8dbd4fbf12f617efc5fdff0eb615337dc9c2fa8d --- tensorflow/python/keras/layers/preprocessing/table_utils.py | 2 +- tensorflow/python/lib/io/file_io.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py index 88e9d95e2ed..f5397da1f3e 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -144,7 +144,7 @@ def get_vocabulary_from_file(vocabulary_path, encoding="utf-8"): vocab = [] with gfile.GFile(vocabulary_path, "r") as reader: while True: - # Get the next line, and break if it is None. + # Get the next line (incl. \n), and break if nothing is left to read. 
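      # Note: readline() keeps the trailing "\n" on non-empty lines and returns
      # "" only once the file is exhausted (see the file_io.py docstring change
      # below), so the emptiness check that follows is a reliable EOF test.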
text = reader.readline() if not text: break diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py index a1db2fb056c..7c484c825d3 100644 --- a/tensorflow/python/lib/io/file_io.py +++ b/tensorflow/python/lib/io/file_io.py @@ -165,7 +165,7 @@ class FileIO(object): self._read_buf.seek(offset) def readline(self): - r"""Reads the next line from the file. Leaves the '\n' at the end.""" + r"""Reads the next line, keeping \n. At EOF, returns ''.""" self._preread_check() return self._prepare_value(self._read_buf.readline()) From 38e941dada7b7d790b4b060ec04ee78d5c9252ef Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 17:40:11 +0200 Subject: [PATCH 197/412] Fix missing return --- tensorflow/python/kernel_tests/map_fn_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index a5c860b407d..7bf793c1e20 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -198,7 +198,7 @@ class MapFnTest(test.TestCase): @tf.function def map_call(x): - tf.map_fn(test_function, x) + return tf.map_fn(test_function, x) x = constant_op.constant([1]) y = map_call(x) From 83b0c2a225869f61cd420abdb044588bcd2f6696 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 08:37:48 -0700 Subject: [PATCH 198/412] Automated g4 rollback of changelist 311477582. PiperOrigin-RevId: 311538137 Change-Id: Id9c4f986f0c5a6408ea60147917fb72977b83efe --- .../api_def_DenseCountSparseOutput.pbtxt | 23 +- .../api_def_RaggedCountSparseOutput.pbtxt | 27 +- .../api_def_SparseCountSparseOutput.pbtxt | 29 ++- tensorflow/core/kernels/count_ops.cc | 246 +++++++----------- tensorflow/core/ops/count_ops.cc | 39 +-- tensorflow/python/ops/bincount.py | 151 +++++++++-- tensorflow/python/ops/bincount_test.py | 188 +++++++++---- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v1/tensorflow.sparse.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 6 +- .../api/golden/v2/tensorflow.sparse.pbtxt | 2 +- 11 files changed, 441 insertions(+), 278 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt index 416da1ccaab..8296bfe6d7b 100644 --- a/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DenseCountSparseOutput.pbtxt @@ -4,61 +4,62 @@ op { in_arg { name: "values" description: <>; +template +using BatchedMap = std::vector>; namespace { // TODO(momernick): Extend this function to work with outputs of rank > 2. 
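// OutputSparse materializes the per-batch count maps as the three components
// of a SparseTensor: output 0 is an [nnz, rank] indices matrix, output 1 holds
// the per-value counts (or weighted sums), and output 2 is the dense shape
// ([num_values] for 1-D input, [num_batches, num_values] otherwise).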
-Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, +template +Status OutputSparse(const BatchedMap& per_batch_counts, int num_values, bool is_1d, OpKernelContext* context) { int total_values = 0; int num_batches = per_batch_counts.size(); @@ -44,12 +47,12 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, context->allocate_output(1, TensorShape({total_values}), &values)); auto output_indices = indices->matrix(); - auto output_values = values->flat(); + auto output_values = values->flat(); int64 value_loc = 0; for (int b = 0; b < num_batches; ++b) { const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); + std::vector> pairs(per_batch_count.begin(), + per_batch_count.end()); std::sort(pairs.begin(), pairs.end()); for (const auto& x : pairs) { if (is_1d) { @@ -77,85 +80,19 @@ Status OutputSparse(const BatchedIntMap& per_batch_counts, int num_values, return Status::OK(); } -Status OutputWeightedSparse(const BatchedIntMap& per_batch_counts, - int num_values, const Tensor& weights, bool is_1d, - OpKernelContext* context) { - if (!TensorShapeUtils::IsVector(weights.shape())) { - return errors::InvalidArgument( - "Weights must be a 1-dimensional tensor. Got: ", - weights.shape().DebugString()); - } - - if (num_values > weights.dim_size(0)) { - return errors::InvalidArgument("The maximum array value was ", num_values, - ", but the weight array has size ", - weights.shape().DebugString()); - } - auto weight_values = weights.flat(); - - int total_values = 0; - int num_batches = per_batch_counts.size(); - for (const auto& per_batch_count : per_batch_counts) { - total_values += per_batch_count.size(); - } - - Tensor* indices; - int inner_dim = is_1d ? 1 : 2; - TF_RETURN_IF_ERROR(context->allocate_output( - 0, TensorShape({total_values, inner_dim}), &indices)); - - Tensor* values; - TF_RETURN_IF_ERROR( - context->allocate_output(1, TensorShape({total_values}), &values)); - - auto output_indices = indices->matrix(); - auto output_values = values->flat(); - int64 value_loc = 0; - for (int b = 0; b < num_batches; ++b) { - const auto& per_batch_count = per_batch_counts[b]; - std::vector> pairs(per_batch_count.begin(), - per_batch_count.end()); - std::sort(pairs.begin(), pairs.end()); - for (const auto& x : pairs) { - if (is_1d) { - output_indices(value_loc, 0) = x.first; - } else { - output_indices(value_loc, 0) = b; - output_indices(value_loc, 1) = x.first; - } - output_values(value_loc) = x.second * weight_values(x.first); - ++value_loc; - } - } - - Tensor* dense_shape; - if (is_1d) { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({1}), &dense_shape)); - dense_shape->flat().data()[0] = num_values; - } else { - TF_RETURN_IF_ERROR( - context->allocate_output(2, TensorShape({2}), &dense_shape)); - dense_shape->flat().data()[0] = num_batches; - dense_shape->flat().data()[1] = num_values; - } - return Status::OK(); -} - -template -T GetOutputSize(T max_seen, T max_length, T min_length) { +int GetOutputSize(int max_seen, int max_length, int min_length) { return max_length > 0 ? 
max_length : std::max((max_seen + 1), min_length); } } // namespace -template +template class DenseCount : public OpKernel { public: explicit DenseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -170,6 +107,15 @@ class DenseCount : public OpKernel { "Input must be a 1 or 2-dimensional tensor. Got: ", data.shape().DebugString())); + if (use_weights) { + OP_REQUIRES( + context, weights.shape() == data.shape(), + errors::InvalidArgument( + "Weights and data must have the same shape. Weight shape: ", + weights.shape().DebugString(), + "; data shape: ", data.shape().DebugString())); + } + bool is_1d = TensorShapeUtils::IsVector(data.shape()); int negative_valued_axis = -1; int num_batch_dimensions = (data.shape().dims() + negative_valued_axis); @@ -179,19 +125,23 @@ class DenseCount : public OpKernel { num_batch_elements *= data.shape().dim_size(i); } int num_value_elements = data.shape().num_elements() / num_batch_elements; - auto per_batch_counts = BatchedIntMap(num_batch_elements); + auto per_batch_counts = BatchedMap(num_batch_elements); + T max_value = 0; const auto data_values = data.flat(); + const auto weight_values = weights.flat(); int i = 0; for (int b = 0; b < num_batch_elements; ++b) { for (int v = 0; v < num_value_elements; ++v) { const auto& value = data_values(i); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[b])[value] = 1; + if (binary_output_) { + per_batch_counts[b][value] = 1; + } else if (use_weights) { + per_batch_counts[b][value] += weight_values(i); } else { - (per_batch_counts[b])[value]++; + per_batch_counts[b][value]++; } if (value > max_value) { max_value = value; @@ -201,30 +151,24 @@ class DenseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; }; -template +template class SparseCount : public OpKernel { public: explicit SparseCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -235,23 +179,27 @@ class SparseCount : public OpKernel { bool use_weights = weights.NumElements() > 0; bool is_1d = shape.NumElements() == 1; - const auto indices_values = indices.matrix(); - const auto values_values = values.flat(); - int num_batches = is_1d ? 
1 : shape.flat()(0); int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + const auto indices_values = indices.matrix(); + const auto values_values = values.flat(); + const auto weight_values = weights.flat(); + + auto per_batch_counts = BatchedMap(num_batches); + T max_value = 0; for (int idx = 0; idx < num_values; ++idx) { int batch = is_1d ? 0 : indices_values(idx, 0); const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch])[value] = 1; + if (binary_output_) { + per_batch_counts[batch][value] = 1; + } else if (use_weights) { + per_batch_counts[batch][value] += weight_values(idx); } else { - (per_batch_counts[batch])[value]++; + per_batch_counts[batch][value]++; } if (value > max_value) { max_value = value; @@ -259,30 +207,25 @@ class SparseCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, is_1d, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - is_1d, context)); - } + int num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -template +template class RaggedCount : public OpKernel { public: explicit RaggedCount(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("minlength", &minlength_)); OP_REQUIRES_OK(context, context->GetAttr("maxlength", &maxlength_)); - OP_REQUIRES_OK(context, context->GetAttr("binary_count", &binary_count_)); + OP_REQUIRES_OK(context, context->GetAttr("binary_output", &binary_output_)); } void Compute(OpKernelContext* context) override { @@ -290,13 +233,15 @@ class RaggedCount : public OpKernel { const Tensor& values = context->input(1); const Tensor& weights = context->input(2); bool use_weights = weights.NumElements() > 0; + bool is_1d = false; const auto splits_values = splits.flat(); const auto values_values = values.flat(); + const auto weight_values = weights.flat(); int num_batches = splits.NumElements() - 1; int num_values = values.NumElements(); - auto per_batch_counts = BatchedIntMap(num_batches); + auto per_batch_counts = BatchedMap(num_batches); T max_value = 0; int batch_idx = 0; @@ -306,10 +251,12 @@ class RaggedCount : public OpKernel { } const auto& value = values_values(idx); if (value >= 0 && (maxlength_ <= 0 || value < maxlength_)) { - if (binary_count_) { - (per_batch_counts[batch_idx - 1])[value] = 1; + if (binary_output_) { + per_batch_counts[batch_idx - 1][value] = 1; + } else if (use_weights) { + per_batch_counts[batch_idx - 1][value] += weight_values(idx); } else { - (per_batch_counts[batch_idx - 1])[value]++; + per_batch_counts[batch_idx - 1][value]++; } if (value > max_value) { max_value = value; @@ -317,42 +264,47 @@ class RaggedCount : public OpKernel { } } - T num_output_values = GetOutputSize(max_value, maxlength_, minlength_); - if (use_weights) { - OP_REQUIRES_OK(context, - OutputWeightedSparse(per_batch_counts, num_output_values, - weights, false, context)); - } else { - OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, - false, context)); - } + int 
num_output_values = GetOutputSize(max_value, maxlength_, minlength_); + OP_REQUIRES_OK(context, OutputSparse(per_batch_counts, num_output_values, + is_1d, context)); } private: - T minlength_; - T maxlength_; - bool binary_count_; + int maxlength_; + int minlength_; + bool binary_output_; + bool validate_; }; -#define REGISTER(TYPE) \ - \ - REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - DenseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - SparseCount) \ - \ - REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ - .TypeConstraint("T") \ - .Device(DEVICE_CPU), \ - RaggedCount) +#define REGISTER_W(W_TYPE) \ + REGISTER(int32, W_TYPE) \ + REGISTER(int64, W_TYPE) -REGISTER(int32); -REGISTER(int64); +#define REGISTER(I_TYPE, W_TYPE) \ + \ + REGISTER_KERNEL_BUILDER(Name("DenseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + DenseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("SparseCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + SparseCount) \ + \ + REGISTER_KERNEL_BUILDER(Name("RaggedCountSparseOutput") \ + .TypeConstraint("T") \ + .TypeConstraint("output_type") \ + .Device(DEVICE_CPU), \ + RaggedCount) + +TF_CALL_INTEGRAL_TYPES(REGISTER_W); +TF_CALL_float(REGISTER_W); +TF_CALL_double(REGISTER_W); + +#undef REGISTER_W #undef REGISTER } // namespace tensorflow diff --git a/tensorflow/core/ops/count_ops.cc b/tensorflow/core/ops/count_ops.cc index c9fbe1f8d8e..8de0a2ef954 100644 --- a/tensorflow/core/ops/count_ops.cc +++ b/tensorflow/core/ops/count_ops.cc @@ -19,12 +19,21 @@ limitations under the License. 
namespace tensorflow { -using shape_inference::DimensionHandle; using shape_inference::InferenceContext; +using shape_inference::ShapeHandle; Status DenseCountSparseOutputShapeFn(InferenceContext *c) { - int32 rank = c->Rank(c->input(0)); - DimensionHandle nvals = c->UnknownDim(); + auto values = c->input(0); + auto weights = c->input(1); + ShapeHandle output; + auto num_weights = c->NumElements(weights); + if (c->ValueKnown(num_weights) && c->Value(num_weights) == 0) { + output = values; + } else { + TF_RETURN_IF_ERROR(c->Merge(weights, values, &output)); + } + auto rank = c->Rank(output); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -32,8 +41,8 @@ Status DenseCountSparseOutputShapeFn(InferenceContext *c) { } Status SparseCountSparseOutputShapeFn(InferenceContext *c) { - DimensionHandle rank = c->Dim(c->input(0), 1); - DimensionHandle nvals = c->UnknownDim(); + auto rank = c->Dim(c->input(0), 1); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -45,7 +54,7 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { if (rank != c->kUnknownRank) { ++rank; // Add the ragged dimension } - DimensionHandle nvals = c->UnknownDim(); + auto nvals = c->UnknownDim(); c->set_output(0, c->Matrix(nvals, rank)); // out.indices c->set_output(1, c->Vector(nvals)); // out.values c->set_output(2, c->Vector(rank)); // out.dense_shape @@ -54,12 +63,12 @@ Status RaggedCountSparseOutputShapeFn(InferenceContext *c) { REGISTER_OP("DenseCountSparseOutput") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(DenseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -69,12 +78,12 @@ REGISTER_OP("SparseCountSparseOutput") .Input("indices: int64") .Input("values: T") .Input("dense_shape: int64") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(SparseCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") @@ -83,12 +92,12 @@ REGISTER_OP("SparseCountSparseOutput") REGISTER_OP("RaggedCountSparseOutput") .Input("splits: int64") .Input("values: T") - .Input("weights: float") + .Input("weights: output_type") .Attr("T: {int32, int64}") .Attr("minlength: int >= -1 = -1") .Attr("maxlength: int >= -1 = -1") - .Attr("binary_count: bool") - .Attr("output_type: {int64, float}") + .Attr("binary_output: bool") + .Attr("output_type: {int32, int64, float, double}") .SetShapeFn(RaggedCountSparseOutputShapeFn) .Output("output_indices: int64") .Output("output_values: output_type") diff --git a/tensorflow/python/ops/bincount.py b/tensorflow/python/ops/bincount.py index e1b3bebaaaa..68950eaf596 100644 --- a/tensorflow/python/ops/bincount.py +++ b/tensorflow/python/ops/bincount.py @@ -18,10 +18,10 @@ 
from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import gen_count_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util.tf_export import tf_export @@ -33,7 +33,7 @@ def sparse_bincount(values, axis=0, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, name=None): """Count the number of times an integer value appears in a tensor. @@ -58,8 +58,9 @@ def sparse_bincount(values, maxlength: If given, skips `values` that are greater than or equal to `maxlength`, and ensures that the output has a `dense_shape` of at most `maxlength` in the inner dimension. - binary_count: Whether to do a binary count. When True, this op will return 1 - for any value that exists instead of counting the number of occurrences. + binary_output: If True, this op will output 1 instead of the number of times + a token appears (equivalent to one_hot + reduce_any instead of one_hot + + reduce_add). Defaults to False. name: A name for this op. Returns: @@ -78,7 +79,7 @@ def sparse_bincount(values, SparseTensor) and returns a SparseTensor where the value of (i,j) is the number of times value j appears in batch i. - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount(data, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( @@ -102,7 +103,7 @@ def sparse_bincount(values, dense shape is [2, 500] instead of [2,10002] or [2, 102]. >>> minlength = maxlength = 500 - >>> data = [[10, 20, 30, 20], [11, 101, 11, 10001]] + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) >>> output = tf.sparse.bincount( ... data, axis=-1, minlength=minlength, maxlength=maxlength) >>> print(output) @@ -123,8 +124,8 @@ def sparse_bincount(values, some values (like 20 in batch 1 and 11 in batch 2) appear more than once, the 'values' tensor is all 1s. - >>> dense = [[10, 20, 30, 20], [11, 101, 11, 10001]] - >>> output = tf.sparse.bincount(dense, binary_count=True, axis=-1) + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> output = tf.sparse.bincount(data, binary_output=True, axis=-1) >>> print(output) SparseTensor(indices=tf.Tensor( [[ 0 10] @@ -136,20 +137,42 @@ def sparse_bincount(values, values=tf.Tensor([1 1 1 1 1 1], shape=(6,), dtype=int64), dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + **Weighted bin-counting** + + This example takes two inputs - a values tensor and a weights tensor. These + tensors must be identically shaped, and have the same row splits or indices + in the case of RaggedTensors or SparseTensors. When performing a weighted + count, the op will output a SparseTensor where the value of (i, j) is the + sum of the values in the weight tensor's batch i in the locations where + the values tensor has the value j. In this case, the output dtype is the + same as the dtype of the weights tensor. 
+ + >>> data = np.array([[10, 20, 30, 20], [11, 101, 11, 10001]], dtype=np.int64) + >>> weights = [[2, 0.25, 15, 0.5], [2, 17, 3, 0.9]] + >>> output = tf.sparse.bincount(data, weights=weights, axis=-1) + >>> print(output) + SparseTensor(indices=tf.Tensor( + [[ 0 10] + [ 0 20] + [ 0 30] + [ 1 11] + [ 1 101] + [ 1 10001]], shape=(6, 2), dtype=int64), + values=tf.Tensor([2. 0.75 15. 5. 17. 0.9], shape=(6,), dtype=float32), + dense_shape=tf.Tensor([ 2 10002], shape=(2,), dtype=int64)) + """ with ops.name_scope(name, "count", [values, weights]): if not isinstance(values, sparse_tensor.SparseTensor): values = ragged_tensor.convert_to_tensor_or_ragged_tensor( values, name="values") + if weights is not None: + if not isinstance(weights, sparse_tensor.SparseTensor): + weights = ragged_tensor.convert_to_tensor_or_ragged_tensor( + weights, name="weights") - if weights is not None and binary_count: - raise ValueError("binary_count and weights are mutually exclusive.") - - if weights is None: - weights = [] - output_type = dtypes.int64 - else: - output_type = dtypes.float32 + if weights is not None and binary_output: + raise ValueError("binary_output and weights are mutually exclusive.") if axis is None: axis = 0 @@ -162,38 +185,114 @@ def sparse_bincount(values, maxlength_value = maxlength if maxlength is not None else -1 if axis == 0: - if isinstance(values, - (sparse_tensor.SparseTensor, ragged_tensor.RaggedTensor)): + if isinstance(values, sparse_tensor.SparseTensor): + if weights is not None: + weights = validate_sparse_weights(values, weights) + values = values.values + elif isinstance(values, ragged_tensor.RaggedTensor): + if weights is not None: + weights = validate_ragged_weights(values, weights) values = values.values else: + if weights is not None: + weights = array_ops.reshape(weights, [-1]) values = array_ops.reshape(values, [-1]) if isinstance(values, sparse_tensor.SparseTensor): + weights = validate_sparse_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.sparse_count_sparse_output( values.indices, values.values, values.dense_shape, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) elif isinstance(values, ragged_tensor.RaggedTensor): + weights = validate_ragged_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.ragged_count_sparse_output( values.row_splits, values.values, - weights=weights, + weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) else: + weights = validate_dense_weights(values, weights) c_ind, c_val, c_shape = gen_count_ops.dense_count_sparse_output( values, weights=weights, minlength=minlength_value, maxlength=maxlength_value, - binary_count=binary_count, - output_type=output_type) + binary_output=binary_output) return sparse_tensor.SparseTensor(c_ind, c_val, c_shape) + + +def validate_dense_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.dtype) + + if not isinstance(weights, ops.Tensor): + raise ValueError( + "`weights` must be a tf.Tensor if `values` is a tf.Tensor.") + + return weights + + +def validate_sparse_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, 
sparse_tensor.SparseTensor): + raise ValueError( + "`weights` must be a SparseTensor if `values` is a SparseTensor.") + + checks = [] + if weights.dense_shape is not values.dense_shape: + checks.append( + check_ops.assert_equal( + weights.dense_shape, + values.dense_shape, + message="'weights' and 'values' must have the same dense shape.")) + if weights.indices is not values.indices: + checks.append( + check_ops.assert_equal( + weights.indices, + values.indices, + message="'weights' and 'values' must have the same indices.") + ) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights + + +def validate_ragged_weights(values, weights): + """Validates the passed weight tensor or creates an empty one.""" + if weights is None: + return array_ops.constant([], dtype=values.values.dtype) + + if not isinstance(weights, ragged_tensor.RaggedTensor): + raise ValueError( + "`weights` must be a RaggedTensor if `values` is a RaggedTensor.") + + checks = [] + if weights.row_splits is not values.row_splits: + checks.append( + check_ops.assert_equal( + weights.row_splits, + values.row_splits, + message="'weights' and 'values' must have the same row splits.")) + if checks: + with ops.control_dependencies(checks): + weights = array_ops.identity(weights.values) + else: + weights = weights.values + + return weights diff --git a/tensorflow/python/ops/bincount_test.py b/tensorflow/python/ops/bincount_test.py index 776b65b72d0..839af8dcc35 100644 --- a/tensorflow/python/ops/bincount_test.py +++ b/tensorflow/python/ops/bincount_test.py @@ -21,6 +21,8 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np +from tensorflow.python.eager import context +from tensorflow.python.framework import errors from tensorflow.python.ops import bincount from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.ragged import ragged_factory_ops @@ -65,7 +67,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -73,7 +75,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], "expected_values": [1, 1, 1, 1, 1], "expected_shape": [2, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -82,7 +84,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 9], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), @@ -91,40 +93,40 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [1, 7]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [2, 8], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_no_maxlength_weights", "x": np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 4], [1, 5]], - "expected_values": [1, 2, 3, 8, 5], + "expected_values": [2, 1, 0.5, 9, 3], "expected_shape": [2, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": 
[[0.5, 1, 2], [3, 4, 5]] }, { "testcase_name": "_maxlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "maxlength": 7, "expected_indices": [[0, 1], [0, 2], [0, 3], [1, 0], [1, 4]], - "expected_values": [1, 2, 3, 0.5, 8], + "expected_values": [2, 1, 0.5, 3, 9], "expected_shape": [2, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[0.5, 1, 2, 11], [7, 3, 4, 5]] }, { "testcase_name": "_minlength_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 9, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_minlength_larger_values_weights", "x": np.array([[3, 2, 1, 7], [7, 0, 4, 4]], dtype=np.int32), "minlength": 3, "expected_indices": [[0, 1], [0, 2], [0, 3], [0, 7], [1, 0], [1, 4], [1, 7]], - "expected_values": [1, 2, 3, 7, 0.5, 8, 7], + "expected_values": [2, 1, 0.5, 3, 5, 13, 4], "expected_shape": [2, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[0.5, 1, 2, 3], [4, 5, 6, 7]] }, { "testcase_name": "_1d", "x": np.array([3, 2, 1, 1], dtype=np.int32), @@ -146,7 +148,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, minlength=None, maxlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): y = bincount.sparse_bincount( @@ -154,7 +156,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): weights=weights, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -216,7 +218,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], "expected_values": [1, 1, 1, 1], "expected_shape": [3, 6], - "binary_count": + "binary_output": True, }, { @@ -230,7 +232,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 7], "maxlength": 7, - "binary_count": + "binary_output": True, }, { @@ -244,7 +246,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 9], "minlength": 9, - "binary_count": + "binary_output": True, }, { @@ -258,7 +260,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_shape": [3, 8], "minlength": 3, - "binary_count": + "binary_output": True, }, { @@ -268,9 +270,10 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 0, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -279,11 +282,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [0, 0, 7, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [2, 4], [2, 5]], - "expected_values": [1, 3, 8, 5], + "expected_values": [2, 6, 7, 10], "expected_shape": [3, 7], "maxlength": 7, - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": + np.array([[6, 0, 2, 0], [0, 0, 14, 0], [10, 0, 3.5, 3.5]]), }, { "testcase_name": @@ -292,11 +296,12 @@ class TestSparseCount(test.TestCase, 
parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 9], "minlength": 9, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": @@ -305,11 +310,12 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): np.array([[3, 0, 1, 0], [7, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32), "expected_indices": [[0, 1], [0, 3], [1, 7], [2, 4], [2, 5]], - "expected_values": [1, 3, 7, 8, 5], + "expected_values": [2, 6, 14, 6.5, 10], "expected_shape": [3, 8], "minlength": 3, - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": + np.array([[6, 0, 2, 0], [14, 0, 0, 0], [10, 0, 3, 3.5]]), }, { "testcase_name": "_1d", @@ -338,16 +344,17 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_sparse = sparse_ops.from_dense(x) + w_sparse = sparse_ops.from_dense(weights) if weights is not None else None y = bincount.sparse_bincount( x_sparse, - weights=weights, + weights=w_sparse, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) @@ -393,7 +400,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 6], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_maxlength_binary", @@ -402,7 +409,7 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1], "expected_shape": [5, 7], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_binary", @@ -412,13 +419,13 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], "expected_shape": [5, 9], - "binary_count": True, + "binary_output": True, }, { "testcase_name": "_minlength_larger_values_binary", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "minlength": 3, - "binary_count": True, + "binary_output": True, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], "expected_values": [1, 1, 1, 1, 1, 1, 1], @@ -428,18 +435,18 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "testcase_name": "_no_maxlength_weights", "x": [[], [], [3, 0, 1], [], [5, 0, 4, 4]], "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 6], - "weights": [0.5, 1, 2, 3, 4, 5] + "weights": [[], [], [6, 0.5, 2], [], [10, 0.25, 5, 3]], }, { "testcase_name": "_maxlength_weights", "x": [[], [], [3, 0, 1], [7], [5, 0, 4, 4]], "maxlength": 7, "expected_indices": [[2, 0], [2, 1], [2, 3], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 0.25, 8, 10], "expected_shape": [5, 7], - "weights": [0.5, 1, 2, 3, 4, 5, 6] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_weights", @@ -447,9 +454,9 @@ class 
TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 9, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 9], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_minlength_larger_values_weights", @@ -457,9 +464,9 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): "minlength": 3, "expected_indices": [[2, 0], [2, 1], [2, 3], [3, 7], [4, 0], [4, 4], [4, 5]], - "expected_values": [0.5, 1, 3, 7, 0.5, 8, 5], + "expected_values": [0.5, 2, 6, 14, 0.25, 8, 10], "expected_shape": [5, 8], - "weights": [0.5, 1, 2, 3, 4, 5, 6, 7, 8] + "weights": [[], [], [6, 0.5, 2], [14], [10, 0.25, 5, 3]], }, { "testcase_name": "_1d", @@ -484,21 +491,114 @@ class TestSparseCount(test.TestCase, parameterized.TestCase): expected_shape, maxlength=None, minlength=None, - binary_count=False, + binary_output=False, weights=None, axis=-1): x_ragged = ragged_factory_ops.constant(x) + w = ragged_factory_ops.constant(weights) if weights is not None else None y = bincount.sparse_bincount( x_ragged, - weights=weights, + weights=w, minlength=minlength, maxlength=maxlength, - binary_count=binary_count, + binary_output=binary_output, axis=axis) self.assertAllEqual(expected_indices, y.indices) self.assertAllEqual(expected_values, y.values) self.assertAllEqual(expected_shape, y.dense_shape) +class TestSparseCountFailureModes(test.TestCase): + + def test_dense_input_sparse_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_ragged_weights_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a tf.Tensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_dense_input_wrong_shape_fails(self): + x = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + weights = np.array([[3, 2], [5, 4], [4, 3]]) + # Note: Eager mode and graph mode throw different errors here. Graph mode + # will fail with a ValueError from the shape checking logic, while Eager + # will fail with an InvalidArgumentError from the kernel itself. 
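    # (In graph mode the static shape mismatch is detected while the op is
    # being added to the graph, before anything executes; in eager mode the op
    # is dispatched immediately, so the error surfaces from the C++ kernel.)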
+ if context.executing_eagerly(): + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + else: + with self.assertRaisesRegexp(ValueError, "both shapes must be equal"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_dense_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_ragged_weights_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [14], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(ValueError, "must be a SparseTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 0, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same indices"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_too_many_indices_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 1, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "Incompatible shapes"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_sparse_input_wrong_shape_fails(self): + x = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4], [0, 0, 0, 0]], + dtype=np.int32)) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same dense shape"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_dense_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = np.array([[3, 2, 1], [5, 4, 4]], dtype=np.int32) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_sparse_weights_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = sparse_ops.from_dense( + np.array([[3, 0, 1, 0], [0, 0, 0, 0], [5, 0, 4, 4]], dtype=np.int32)) + with self.assertRaisesRegexp(ValueError, "must be a RaggedTensor"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + def test_ragged_input_different_shape_fails(self): + x = ragged_factory_ops.constant([[6, 1, 2], [14], [10, 1, 5, 3]]) + weights = ragged_factory_ops.constant([[6, 0.5, 2], [], [10, 0.25, 5, 3]]) + with self.assertRaisesRegexp(errors.InvalidArgumentError, + "must have the same row splits"): + self.evaluate(bincount.sparse_bincount(x, weights=weights, axis=-1)) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt 
b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt index 4c4f6c62291..f8f8edb26a8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt @@ -14,7 +14,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 05b8842be66..44fb74ac63a 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -1078,7 +1078,7 @@ tf_module { } member_method { name: "DenseCountSparseOutput" - argspec: "args=[\'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "DenseToCSRSparseMatrix" @@ -3074,7 +3074,7 @@ tf_module { } member_method { name: "RaggedCountSparseOutput" - argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', 
\'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'splits\', \'values\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "RaggedCross" @@ -4094,7 +4094,7 @@ tf_module { } member_method { name: "SparseCountSparseOutput" - argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_count\', \'output_type\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'indices\', \'values\', \'dense_shape\', \'weights\', \'binary_output\', \'minlength\', \'maxlength\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " } member_method { name: "SparseCross" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt index a9ad81920dd..67235bb2cf2 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt @@ -10,7 +10,7 @@ tf_module { } member_method { name: "bincount" - argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_count\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " + argspec: "args=[\'values\', \'weights\', \'axis\', \'minlength\', \'maxlength\', \'binary_output\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'None\', \'False\', \'None\'], " } member_method { name: "concat" From 8e8c67c3375da3fe8b44e7c11eb1d3fbb2eaa41c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 08:44:49 -0700 Subject: [PATCH 199/412] Comment typo fix. PiperOrigin-RevId: 311539306 Change-Id: Ieb8cf58b706e822177269b00a1a0ba58f0a97067 --- tensorflow/python/ops/tensor_array_ops.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py index e8ea9ff4e4d..d386d14b64a 100644 --- a/tensorflow/python/ops/tensor_array_ops.py +++ b/tensorflow/python/ops/tensor_array_ops.py @@ -1122,7 +1122,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the control dependencies from the contexts will become control dependencies for writes, reads, etc. - Use this object all for subsequent operations. + Use this object for all subsequent operations. """ return self._implementation.identity() @@ -1152,7 +1152,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the write occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if there are more writers than specified. @@ -1217,7 +1217,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the unstack occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if the shape inference fails. @@ -1236,7 +1236,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the scatter occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if the shape inference fails. 
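The docstrings above all describe the same contract: ops like write, unstack,
and scatter return a new TensorArray whose flow tensor captures the operation,
and that returned object is the one to use from then on. A minimal illustrative
sketch of the intended chaining, using only the public tf.TensorArray API:

  import tensorflow as tf

  ta = tf.TensorArray(tf.float32, size=3)
  ta = ta.write(0, 1.0)  # keep the returned TensorArray, not the old handle
  ta = ta.write(1, 2.0)
  ta = ta.write(2, 3.0)
  print(ta.stack())  # tf.Tensor([1. 2. 3.], shape=(3,), dtype=float32)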
@@ -1255,7 +1255,7 @@ class TensorArray(object): Returns: A new TensorArray object with flow that ensures the split occurs. - Use this object all for subsequent operations. + Use this object for all subsequent operations. Raises: ValueError: if the shape inference fails. From e033fd5b33e5f3cfb7b075715e6d38c3de2383fd Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Thu, 14 May 2020 08:44:55 -0700 Subject: [PATCH 200/412] [TF MLIR SI] Don't constant fold, only consider result of folding This results in less changes to the module during shape inference (e.g., only shapes are changed, no constant nodes are created). Effectively this computes the folded result and then just uses that information locally. Which is conceptually more wasteful (as a subsequent canonicalize pass may need to recompute these) but is less surprising and avoids dropping attributes during this part. There is still additional changes that need to be made to avoid doing needless computations here, this mostly focuses on decreasing graph mutations. PiperOrigin-RevId: 311539328 Change-Id: Ib6daa331c1e18a6d23463aa945c87e59d253708b --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 10 +- .../tensorflow/tests/shape_inference.mlir | 31 ++-- .../tensorflow/transforms/shape_inference.cc | 145 +++++++++++++----- 3 files changed, 137 insertions(+), 49 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 2007824369c..b21fef32cca 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -3551,12 +3551,20 @@ OpFoldResult FoldIdentityTranspose(TransposeOp op) { if (!const_perm) return {}; auto const_value = const_perm.value(); - const auto &elements = const_value.getValues(); + const auto elements = const_value.getValues(); for (auto it : llvm::enumerate(elements)) { if (it.index() != it.value()) return {}; } + // TODO(jpienaar): Remove when we handle this more generally. + if (op.getType() != op.x().getType()) { + // If the types don't match then only fold if all the operands are in the TF + // dialect. 
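+ // (The loop below requires every user of the transpose result to itself be a TF-dialect op.)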
+ for (auto user : op.getOperation()->getUsers()) + if (user->getDialect() != op.getDialect()) return {}; + } + return op.x(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 160bba94cfc..cfe8db9025e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -3,8 +3,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { - // CHECK-NOT: tf.Cast - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2" + // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: return %[[RESULT]] : tensor<1xi32> %0 = "tf.Cast"(%arg0) : (tensor<1xi32>) -> tensor<*xi32> %1 = "tf.Cast"(%arg1) : (tensor<1xi32>) -> tensor<*xi32> @@ -60,8 +60,8 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @simple_folding func @simple_folding(%arg0: tensor<1x1x1x1xi32>, %arg1: tensor<1x1x1x1xf32>) -> tensor { -// CHECK: %[[CST:.*]] = "tf.Const"{{.*}} {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> -// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[CST]] +// CHECK: %[[SHAPE:.*]] = "tf.Shape" +// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] // CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> // CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> %0 = "tf.Shape"(%arg0) : (tensor<1x1x1x1xi32>) -> tensor<4xi32> @@ -300,13 +300,6 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xi32> } - // CHECK-LABEL: func @fold_cast - func @fold_cast(%arg0: tensor<*xf32>) -> tensor<*xf32> { - // CHECK-NOT: Cast - %0 = "tf.Cast"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) - return %0 : tensor<*xf32> - } - // CHECK-LABEL: func @while_variant // CHECK-SAME: -> tensor>> func @while_variant(%arg0: tensor>>) -> tensor { @@ -362,8 +355,6 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @partitioned_call_func_const func @partitioned_call_func_const(%arg0: tensor<2xi32>) -> tensor<2xi32> { - // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> - // CHECK: return %[[CONST]] return %arg0 : tensor<2xi32> } @@ -410,4 +401,18 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { %40 = "tf.Reshape"(%39, %19) {T = f32, Tshape = i32, device = ""} : (tensor<1x4x4x32xf32>, tensor<2xi32>) -> tensor return } + + // CHECK-LABEL: const_fold + func @const_fold() -> () { + // CHECK: tf.Const + // CHECK-SAME: () -> tensor<4xi32> + %0 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> + // CHECK: tf.Const + // CHECK-SAME: () -> tensor<4xi32> + %1 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> + // CHECK: tf.Add + // CHECK-SAME: (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> + %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 5a2cae38062..6a63e83be0f 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -430,6 +430,7 @@ LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, Attribute ComputeOutputComponent(const ValuePort& value_port, ValueQueryFn values) { LLVM_DEBUG(value_port.print(llvm::errs() << "\nComputing output for ")); + if (auto known = values(value_port)) return known; auto op = value_port.producer.dyn_cast(); if (!op) return nullptr; @@ -454,6 +455,7 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, ValuePort op_port(op->getOperand(port[1])); return values(op_port); } + return nullptr; } @@ -475,8 +477,11 @@ class ShapeInference { } Attribute ComputeOutputComponent(const ValuePort& value_port) { - return ::mlir::TF::ComputeOutputComponent( + if (auto known_attr = results_[value_port]) return known_attr; + auto attr = ::mlir::TF::ComputeOutputComponent( value_port, [this](const ValuePort& port) { return results_[port]; }); + RecordValue(value_port, attr); + return attr; } // Returns ShapeHandle if the op result could be computed as shape. @@ -520,19 +525,35 @@ class ShapeInference { LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, int64_t max_iteration); + // Propagates any constant operand of call_op to the called function body's + // corresponding argument if the callee has only one use. + // + // TODO(b/154065712): Move this to a more general inter-procedural constant + // folding pass. + void PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module); + + // Propagates any constant return value of the callee function to the call + // op's corresponding result. + void PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module); + + // Tries to compute the result of folding the op. This doesn't actually + // perform constant folding, it is just computes the equivalent constants. + // Returns whether it was able to compute constant values. + LogicalResult TryToFold(Operation* op); + private: // Mapping between ValuePort (which corresponds to an OpResult or smaller, // e.g., first element of OpResult produded) to an Attribute if the ValuePort // corresponds to a constant value. ValuePortResultMap results_; int64_t graph_version_; - MLIRContext* context_; Dialect* tf_dialect_; }; ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context) : graph_version_(graph_version) { - context_ = context; tf_dialect_ = context->getRegisteredDialect(); } @@ -581,7 +602,6 @@ ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, auto ret = ComputeOutputComponent(front); if (!ret) continue; - RecordValue(front, ret); LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); // If worklist is empty, then this is the root query op. @@ -686,10 +706,14 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { size_t index = it.index(); // If the operand is constant, then convert it to Tensor. 
- ElementsAttr attr; - if (matchPattern(operand, m_Constant(&attr))) { + ValuePort vp(operand); + Attribute attr = ComputeOutputComponent(vp); + if (!attr && matchPattern(operand, m_Constant(&attr))) + RecordValue(vp, attr); + if (attr) { tensorflow::Tensor* input_tensor = &tensors[index]; - auto status = tensorflow::ConvertToTensor(attr, input_tensor); + auto status = + tensorflow::ConvertToTensor(attr.cast(), input_tensor); if (status.ok()) { input_tensors[index] = input_tensor; } else { @@ -865,13 +889,9 @@ LogicalResult ShapeInference::PropagateShapeToFunctions( return success(all_succeeded); } -// If the callee has only one use, propagates any constant operand of call_op to -// the called function body's corresponding argument. -// -// TODO(b/154065712): Move this to a more general inter-procedural constant -// folding pass. -void PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module) { +void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, + ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); @@ -879,31 +899,29 @@ void PropagateConstantToCallee(CallOpInterface call_op, Operation* op = call_op.getOperation(); if (num_uses == 1) { // If this is the only caller, and an operand is a constant, propagate - // the constant inside the function. + // the constant value inside the function. for (auto arg : func.getArguments()) { - auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); - if (isa_and_nonnull(operand)) { - arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); - } + auto operand = op->getOperand(arg.getArgNumber()); + if (auto known_constant = ComputeOutputComponent(ValuePort(operand))) + RecordValue(ValuePort(arg), known_constant); } } } -// Propagates any constant return value of the callee function to the call op's -// corresponding result. -void PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module) { +void ShapeInference::PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, + ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); - // If the return value is a constant, replace the call result with a constant. + // If the return value is a constant, use the constant as the value of + // the call return. Operation* op = call_op.getOperation(); OpBuilder builder(op); builder.setInsertionPointAfter(op); for (auto retval : llvm::enumerate(func.front().getTerminator()->getOperands())) { - auto retval_op = retval.value().getDefiningOp(); - if (isa_and_nonnull(retval_op)) { - op->getResult(retval.index()) - .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); + ValuePort vp(retval.value()); + if (auto known_constant = ComputeOutputComponent(vp)) { + RecordValue(ValuePort(op->getResult(retval.index())), known_constant); } } } @@ -938,10 +956,68 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( return success(); } +LogicalResult ShapeInference::TryToFold(Operation* op) { + // If any output result is known, then the op probably has been computed + // before. 
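+ // (Only the first result is consulted here, as a cheap proxy for the whole op having been folded already.)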
+ if (op->getNumResults() > 0 && results_[ValuePort(op->getResult(0))]) + return success(); + + SmallVector constant_operands(op->getNumOperands()); + SmallVector fold_results; + + // Check to see if any operands to the operation is constant and whether + // the operation knows how to constant fold itself. + bool some_unknown = false; + for (int i = 0, e = op->getNumOperands(); i != e; ++i) { + if (!(constant_operands[i] = + ComputeOutputComponent(ValuePort(op->getOperand(i))))) + some_unknown = true; + } + + // Attempt to constant fold the operation. + auto* abstract_op = op->getAbstractOperation(); + if (abstract_op) { + if (failed(abstract_op->foldHook(op, constant_operands, fold_results))) + return failure(); + } else { + Dialect* dialect = op->getDialect(); + if (!dialect) return failure(); + // Only attempt TF dialect fallback if there are no unknown operands. + if (some_unknown && dialect == tf_dialect_) return failure(); + SmallVector constants; + if (failed(dialect->constantFoldHook(op, constant_operands, constants))) + return failure(); + fold_results.assign(constants.begin(), constants.end()); + } + + for (auto result : zip(op->getResults(), fold_results)) { + auto fold_result = std::get<1>(result); + Attribute attr = nullptr; + if ((attr = fold_result.dyn_cast())) { + RecordValue(ValuePort(std::get<0>(result)), attr); + } else { + auto value = fold_result.get(); + if ((attr = ComputeOutputComponent(ValuePort(value)))) + RecordValue(ValuePort(std::get<0>(result)), attr); + } + + if (ElementsAttr eattr = attr.dyn_cast_or_null()) { + if (std::get<0>(result).getType() == eattr.getType()) continue; + + // Inserts a cast back to the original type if any user is not in the + // TF dialect. + Type old_type = std::get<0>(result).getType(); + std::get<0>(result).setType(eattr.getType()); + AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), tf_dialect_, + old_type); + } + } + + return success(); +} + LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, int64_t max_iteration) { - // An operation folder that is used to attempt folding before inference._ - OperationFolder folder(context_); bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -955,9 +1031,7 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, region->walk([&](Operation* op) { if (auto infer_ti = dyn_cast(op)) { changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); - // TODO(jpienaar): Debug why we can't just return here. We end up with - // additional constant due to the propagation of constant into attached - // function if we return already. + return; } if (op->getDialect() != tf_dialect_) { @@ -965,8 +1039,9 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, return; } - // Before attempting inference, just try to fold the operation. - if (succeeded(folder.tryToFold(op))) return; + // Before attempting inference, just try to compute the folded + // value/shape. + if (succeeded(TryToFold(op))) return; // Best-effort shape inference in attached functions. Do not return // failure even if it doesn't get to fixed point. From 8565ed2eed43057d4f880a0594100108df438d85 Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Thu, 14 May 2020 08:55:13 -0700 Subject: [PATCH 201/412] Removing TensorHandleList Delete APIs, since TensorHandleList pointer is owned by ConcreteFunction. 
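Callers now treat a TF_TensorHandleList as a borrowed, read-only view and must not free it. A minimal usage sketch under that contract (how the list is obtained from the owning ConcreteFunction is deliberately left abstract; only the two accessors below are part of this header):

  #include "tensorflow/c/eager/c_api.h"
  #include "tensorflow/c/experimental/saved_model/public/tensorhandle_list.h"

  // `list` is borrowed from the ConcreteFunction that owns it.
  static void VisitHandles(const TF_TensorHandleList* list) {
    size_t n = TF_TensorHandleListSize(list);
    for (size_t i = 0; i < n; ++i) {
      TFE_TensorHandle* handle = TF_TensorHandleListGet(list, (int)i);
      (void)handle;  // Inspect or use the handle here.
    }
    // No TF_DeleteTensorHandleList call: the owning ConcreteFunction releases the list.
  }
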
PiperOrigin-RevId: 311541122 Change-Id: I0a538b3452c62ee021cf7a41257cbcf580c0d3f2 --- .../c/experimental/saved_model/internal/tensorhandle_list.cc | 3 --- .../c/experimental/saved_model/public/tensorhandle_list.h | 4 ---- 2 files changed, 7 deletions(-) diff --git a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc index 6ef937591aa..7d018658101 100644 --- a/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc +++ b/tensorflow/c/experimental/saved_model/internal/tensorhandle_list.cc @@ -32,8 +32,5 @@ TFE_TensorHandle* TF_TensorHandleListGet(const TF_TensorHandleList* list, return tensorflow::wrap((*tensorflow::unwrap(list))[i]); } -void TF_DeleteTensorHandleList(const TF_TensorHandleList* list) { - delete tensorflow::unwrap(list); -} } // end extern "C" diff --git a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h index 393708aa2bf..a1e88db3474 100644 --- a/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h +++ b/tensorflow/c/experimental/saved_model/public/tensorhandle_list.h @@ -36,10 +36,6 @@ TF_CAPI_EXPORT extern size_t TF_TensorHandleListSize( TF_CAPI_EXPORT extern TFE_TensorHandle* TF_TensorHandleListGet( const TF_TensorHandleList* list, int i); -// Deletes `list`. -TF_CAPI_EXPORT extern void TF_DeleteTensorHandleList( - const TF_TensorHandleList* list); - #ifdef __cplusplus } // end extern "C" #endif // __cplusplus From ffef54602d33f3b23ce21a0d421efde05efe7cef Mon Sep 17 00:00:00 2001 From: bhack Date: Thu, 14 May 2020 18:21:13 +0200 Subject: [PATCH 202/412] Fix missing self Add initial autograph wrapping in map_fn --- tensorflow/python/kernel_tests/map_fn_test.py | 2 +- tensorflow/python/ops/map_fn.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 7bf793c1e20..1859c6c5873 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -187,7 +187,7 @@ class MapFnTest(test.TestCase): self.assertAllEqual(nums, received[2]) @test_util.run_in_graph_and_eager_modes - def testMap_autograph_indirect(): + def testMap_autograph_indirect(self): def test_function(x): cond = tf.constant(-1) if cond == 0: diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 2c9c678336e..dfe32998282 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,6 +39,12 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export +autograph_ctx = lazy_loader.LazyLoader( + "autograph_ctx", globals(), + "tensorflow.python.autograph.core.ag_ctx") +autograph = lazy_loader.LazyLoader( + "autograph", globals(), + "tensorflow.python.autograph.impl.api") @tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") @@ -477,7 +483,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - result_value = fn(elems_value) + ag_ctx = autograph_ctx.control_status_ctx() + result_value = autograph.tf_convert(elems_value, ag_ctx) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = 
_result_value_flat_to_batchable( From 5d3c548620a5e23ba765cd8d7a09feaa08e9b056 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 09:35:50 -0700 Subject: [PATCH 203/412] Resolve trivial aliases for portable TensorFlow targets. PiperOrigin-RevId: 311548335 Change-Id: I837aa5a62500682783607841f0c993c2b6c238ed --- tensorflow/c/BUILD | 20 ++++++++--------- tensorflow/c/eager/BUILD | 4 ++-- tensorflow/cc/saved_model/BUILD | 2 +- tensorflow/compiler/jit/BUILD | 2 +- tensorflow/core/common_runtime/eager/BUILD | 18 +++++++-------- tensorflow/core/kernels/BUILD | 26 +++++++++++----------- tensorflow/examples/label_image/BUILD | 2 +- tensorflow/java/src/main/native/BUILD | 2 +- tensorflow/lite/delegates/flex/BUILD | 14 ++++++------ tensorflow/lite/testing/BUILD | 12 +++++----- tensorflow/lite/testing/kernel_test/BUILD | 2 +- tensorflow/tools/benchmark/BUILD | 4 ++-- 12 files changed, 54 insertions(+), 54 deletions(-) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 66ade5c7bd4..7fb02028837 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -85,7 +85,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:chromiumos": [ ":tf_attrtype", @@ -182,7 +182,7 @@ tf_cuda_library( ":tf_status_internal", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tf_status", @@ -219,7 +219,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:lib", @@ -234,7 +234,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tf_status_internal", @@ -272,7 +272,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:framework", @@ -288,7 +288,7 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tensor_interface", @@ -313,7 +313,7 @@ tf_cuda_library( visibility = ["//tensorflow:internal"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":tensor_interface", @@ -426,7 +426,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", @@ -457,7 +457,7 @@ tf_cuda_library( ] + select({ "//tensorflow:android": [ ":c_api_internal", - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api_internal", @@ -484,7 +484,7 @@ tf_cuda_library( ":tf_status_helper", ] + 
select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:framework", diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index 69808f6f49f..fe4d5ac6ffe 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -35,7 +35,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":context_interface", @@ -412,7 +412,7 @@ tf_cuda_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ ":c_api", diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index a20cc9c9945..b13d8db48a9 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -84,7 +84,7 @@ cc_library( "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", ]) + if_android([ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ]), ) diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index 28d922f9e3c..bc8fac0e88f 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -251,7 +251,7 @@ cc_library( visibility = [":friends"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/core:graph", diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD index 2b2313d91ff..625468b39d5 100644 --- a/tensorflow/core/common_runtime/eager/BUILD +++ b/tensorflow/core/common_runtime/eager/BUILD @@ -47,7 +47,7 @@ tf_cuda_library( visibility = ["//tensorflow:internal"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:core_cpu_lib", @@ -83,7 +83,7 @@ tf_cuda_library( "//tensorflow/core/distributed_runtime:worker_env", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "@com_google_absl//absl/types:optional", @@ -147,7 +147,7 @@ tf_cuda_library( "//tensorflow/core/platform:platform_port", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:core_cpu_lib", @@ -181,7 +181,7 @@ tf_cuda_library( ":eager_executor", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "@com_google_absl//absl/types:variant", @@ -207,7 +207,7 @@ tf_cuda_library( ":tensor_handle_data", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "@com_google_absl//absl/strings", @@ -312,7 +312,7 @@ tf_cuda_library( "@farmhash_archive//:farmhash", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + 
"//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:windows": KERNEL_AND_DEVICE_DEPS, "//conditions:default": KERNEL_AND_DEVICE_DEPS + [ @@ -381,7 +381,7 @@ cc_library( "//tensorflow/core/profiler/lib:traceme", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/distributed_runtime/eager:remote_mgr", @@ -498,7 +498,7 @@ cc_library( "//tensorflow/core/profiler/lib:traceme", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core/distributed_runtime/eager:remote_mgr", @@ -527,7 +527,7 @@ tf_cuda_library( "@farmhash_archive//:farmhash", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//conditions:default": [ "//tensorflow/core:core_cpu", diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 6cb8704f494..788924e8b37 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -4021,7 +4021,7 @@ cc_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//third_party/eigen3", @@ -4046,7 +4046,7 @@ cc_library( ":eigen_spatial_convolutions-inl", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:framework", @@ -4062,7 +4062,7 @@ cc_library( deps = select({ "//tensorflow:android": [ ":conv_3d_mobile", - "//tensorflow/core:android_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ ":conv_3d", @@ -7270,8 +7270,8 @@ tf_cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ ":quantized_ops", @@ -7331,8 +7331,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ ":ops_util", @@ -7416,8 +7416,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ ":ops_testutil", @@ -7603,8 +7603,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ 
":ops_util", @@ -7829,8 +7829,8 @@ cc_binary( ] + select({ "//tensorflow:android": [ ":android_tensorflow_kernels", - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:framework", diff --git a/tensorflow/examples/label_image/BUILD b/tensorflow/examples/label_image/BUILD index 162a44ac109..a0e5005d45a 100644 --- a/tensorflow/examples/label_image/BUILD +++ b/tensorflow/examples/label_image/BUILD @@ -35,7 +35,7 @@ tf_cc_binary( # cc:cc_ops is used to include image ops (for label_image) # Jpg, gif, and png related code won't be included "//tensorflow/cc:cc_ops", - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", # cc:android_tensorflow_image_op is for including jpeg/gif/png # decoder to enable real-image evaluation on Android "//tensorflow/core/kernels:android_tensorflow_image_op", diff --git a/tensorflow/java/src/main/native/BUILD b/tensorflow/java/src/main/native/BUILD index 0b363ff577e..e38e58d6fe6 100644 --- a/tensorflow/java/src/main/native/BUILD +++ b/tensorflow/java/src/main/native/BUILD @@ -30,7 +30,7 @@ tf_cuda_library( }), deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//conditions:default": [ "//tensorflow/c:c_api", diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD index d69d2207e63..98314fdc1b8 100644 --- a/tensorflow/lite/delegates/flex/BUILD +++ b/tensorflow/lite/delegates/flex/BUILD @@ -23,7 +23,7 @@ cc_library( "//tensorflow/lite:string_util", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -63,7 +63,7 @@ cc_library( ":delegate_only_runtime", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -100,7 +100,7 @@ cc_library( "//tensorflow/lite:util", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -134,7 +134,7 @@ cc_library( "@com_google_absl//absl/memory", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -180,7 +180,7 @@ cc_library( # set of core TensorFlow kernels. We may want to revisit this dependency # to allow selective registration via build targets. 
"//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", @@ -208,7 +208,7 @@ tf_cc_test( "@com_google_googletest//:gtest", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -242,7 +242,7 @@ cc_library( "//tensorflow/lite:kernel_api", ] + select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib_lite", diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD index df85f659bf3..379230b3a4b 100644 --- a/tensorflow/lite/testing/BUILD +++ b/tensorflow/lite/testing/BUILD @@ -68,8 +68,8 @@ exports_files([ "//tensorflow/core:test", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], }), ) for conversion_mode, test_name, tags, args in generated_test_models_all() + merged_test_models()] @@ -326,7 +326,7 @@ cc_library( "//tensorflow/core:tensorflow", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -365,7 +365,7 @@ cc_library( "//tensorflow/core:framework", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -405,7 +405,7 @@ cc_library( "//tensorflow/core:lib", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", @@ -440,7 +440,7 @@ cc_library( "//tensorflow/core:lib", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], "//tensorflow:ios": [ "//tensorflow/core:portable_tensorflow_lib", diff --git a/tensorflow/lite/testing/kernel_test/BUILD b/tensorflow/lite/testing/kernel_test/BUILD index 5180f2f4e5a..76333c76259 100644 --- a/tensorflow/lite/testing/kernel_test/BUILD +++ b/tensorflow/lite/testing/kernel_test/BUILD @@ -25,7 +25,7 @@ cc_library( "//tensorflow/core:lib", ], "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_lib", ], }), ) diff --git a/tensorflow/tools/benchmark/BUILD b/tensorflow/tools/benchmark/BUILD index 93b408d522e..674133431f1 100644 --- a/tensorflow/tools/benchmark/BUILD +++ b/tensorflow/tools/benchmark/BUILD @@ -28,8 +28,8 @@ cc_library( visibility = ["//visibility:public"], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib", - "//tensorflow/core:android_tensorflow_test_lib", + "//tensorflow/core:portable_tensorflow_lib", + "//tensorflow/core:portable_tensorflow_test_lib", ], "//conditions:default": [ "//tensorflow/core:core_cpu", From 866e01f318188f15c00d77c2efb219a2c50eb96b Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Thu, 14 May 2020 09:55:16 -0700 Subject: [PATCH 204/412] [XLA:Python] Cache the backend in xla_client_test. 
This is in preparation for removing backend caching logic from xla_client. PiperOrigin-RevId: 311551914 Change-Id: Ia791dc911bd7d9890dec111b8da69a9c619f061c --- tensorflow/compiler/xla/python/xla_client_test.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index 62b3fae018a..fbdd9921a40 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -2029,8 +2029,11 @@ def TestFactory(xla_backend, cloud_tpu=False): return tests -def InstantiateTests(globals_dict, backend, test_prefix="", **kw): - for klass in TestFactory(backend, **kw): +def InstantiateTests(globals_dict, backend_fn, test_prefix="", **kw): + # Avoid creating a new backend per test (this causes GPU OOM, and is probably + # inefficient). + backend_fn = functools.lru_cache(maxsize=None)(backend_fn) + for klass in TestFactory(backend_fn, **kw): test = type(test_prefix + klass.__name__, (klass,), {}) # Clean up the qualified names of the tests to not include the test factory. test.__qualname__ = test.__name__ From 9dd3efb5aa3bacba8c66042ff975a3b9d4d30f95 Mon Sep 17 00:00:00 2001 From: Thomas Joerg Date: Thu, 14 May 2020 10:06:52 -0700 Subject: [PATCH 205/412] Do not silently ignore ptxas compilation failures. Change the xla_gpu_unsafe_fallback_to_driver_on_ptxas_error default to false. PiperOrigin-RevId: 311554370 Change-Id: I9a7f9ff114957998a84136e16333addf4a2cd354 --- tensorflow/compiler/xla/debug_options_flags.cc | 2 +- tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 216fb0a7422..60a563ee956 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -66,7 +66,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_enable_xprof_traceme(true); // TODO(b/155295372): disable ptxas fallback by default. opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(true); - opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(true); + opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_error(false); return opts; } diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 0196267d904..7ff8d40b440 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -416,11 +416,12 @@ std::vector NVPTXCompiler::CompileGpuAsmOrGetCachedResult( CHECK(hlo_module_config.debug_options() .xla_gpu_unsafe_fallback_to_driver_on_ptxas_error()) << "There was an error when trying to compile ptx into sass " - "code. If you want to try falling back to the GPU driver to " - "jit compile ptx, you can use the flag " - "--xla_gpu_unsafe_fallback_to_driver_on_ptxas_error." - " Use at your own risk though, it has known drawbacks like " - "increased memory consumption."; + "code. Up until May 14 2020, XLA silently ignored such " + "errors and fell back to the GPU driver. This is likely to " + "trigger subtle runtime issues and is hence discouraged. 
" + "If you want to temporarily restore this behavior use the " + "flag --xla_gpu_unsafe_fallback_to_driver_on_ptxas_error " + "and file a bug in b/components/366096."; } // We're going to use the driver to JIT our PTX->SASS, so warn if From c3d351abd20a814e7a8eae4e3d951b18667cbac8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 10:24:35 -0700 Subject: [PATCH 206/412] Internal change PiperOrigin-RevId: 311558265 Change-Id: Ib91edbfdbd7d3442c72401a794283518393bc64d --- .../compiler/mlir/tensorflow/ir/tf_ops.cc | 10 +- .../tensorflow/tests/shape_inference.mlir | 31 ++-- .../tensorflow/transforms/shape_inference.cc | 145 +++++------------- 3 files changed, 49 insertions(+), 137 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index b21fef32cca..2007824369c 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -3551,20 +3551,12 @@ OpFoldResult FoldIdentityTranspose(TransposeOp op) { if (!const_perm) return {}; auto const_value = const_perm.value(); - const auto elements = const_value.getValues(); + const auto &elements = const_value.getValues(); for (auto it : llvm::enumerate(elements)) { if (it.index() != it.value()) return {}; } - // TODO(jpienaar): Remove when we handle this more generally. - if (op.getType() != op.x().getType()) { - // If the types don't match then only fold if all the operands are in the TF - // dialect. - for (auto user : op.getOperation()->getUsers()) - if (user->getDialect() != op.getDialect()) return {}; - } - return op.x(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index cfe8db9025e..160bba94cfc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -3,8 +3,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 130 : i32}} { // CHECK-LABEL: func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi32> func @main(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { - // CHECK: %[[RESULT:.*]] = "tf.AddV2" - // CHECK-SAME: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NOT: tf.Cast + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> // CHECK: return %[[RESULT]] : tensor<1xi32> %0 = "tf.Cast"(%arg0) : (tensor<1xi32>) -> tensor<*xi32> %1 = "tf.Cast"(%arg1) : (tensor<1xi32>) -> tensor<*xi32> @@ -60,8 +60,8 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @simple_folding func @simple_folding(%arg0: tensor<1x1x1x1xi32>, %arg1: tensor<1x1x1x1xf32>) -> tensor { -// CHECK: %[[SHAPE:.*]] = "tf.Shape" -// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[SHAPE]] +// CHECK: %[[CST:.*]] = "tf.Const"{{.*}} {value = dense<1> : tensor<4xi32>} : () -> tensor<4xi32> +// CHECK: %[[CONV:.*]] = "tf.Conv2DBackpropInput"(%[[CST]] // CHECK-SAME: (tensor<4xi32>, tensor<1x1x1x1xf32>, tensor<1x1x1x1xf32>) -> tensor<1x1x1x1xf32> // CHECK: return %[[CONV]] : tensor<1x1x1x1xf32> %0 = "tf.Shape"(%arg0) : (tensor<1x1x1x1xi32>) -> tensor<4xi32> @@ -300,6 +300,13 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { return %0 : tensor<*xi32> } + // CHECK-LABEL: func @fold_cast + func @fold_cast(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK-NOT: Cast + %0 = 
"tf.Cast"(%arg0) : (tensor<*xf32>) -> (tensor<*xf32>) + return %0 : tensor<*xf32> + } + // CHECK-LABEL: func @while_variant // CHECK-SAME: -> tensor>> func @while_variant(%arg0: tensor>>) -> tensor { @@ -355,6 +362,8 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { // CHECK-LABEL: func @partitioned_call_func_const func @partitioned_call_func_const(%arg0: tensor<2xi32>) -> tensor<2xi32> { + // CHECK: %[[CONST:.*]] = "tf.Const"() {value = dense<[3, 2]> : tensor<2xi32>} : () -> tensor<2xi32> + // CHECK: return %[[CONST]] return %arg0 : tensor<2xi32> } @@ -401,18 +410,4 @@ func @multiple_blocks_one_return(%arg0: tensor) -> tensor<*xf32> { %40 = "tf.Reshape"(%39, %19) {T = f32, Tshape = i32, device = ""} : (tensor<1x4x4x32xf32>, tensor<2xi32>) -> tensor return } - - // CHECK-LABEL: const_fold - func @const_fold() -> () { - // CHECK: tf.Const - // CHECK-SAME: () -> tensor<4xi32> - %0 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> - // CHECK: tf.Const - // CHECK-SAME: () -> tensor<4xi32> - %1 = "tf.Const"() {value = dense<[200, 26, 26, 32]> : tensor<4xi32>} : () -> tensor<*xi32> - // CHECK: tf.Add - // CHECK-SAME: (tensor<4xi32>, tensor<4xi32>) -> tensor<4xi32> - %2 = "tf.Add"(%0, %1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> - return - } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 6a63e83be0f..5a2cae38062 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -430,7 +430,6 @@ LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, Attribute ComputeOutputComponent(const ValuePort& value_port, ValueQueryFn values) { LLVM_DEBUG(value_port.print(llvm::errs() << "\nComputing output for ")); - if (auto known = values(value_port)) return known; auto op = value_port.producer.dyn_cast(); if (!op) return nullptr; @@ -455,7 +454,6 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, ValuePort op_port(op->getOperand(port[1])); return values(op_port); } - return nullptr; } @@ -477,11 +475,8 @@ class ShapeInference { } Attribute ComputeOutputComponent(const ValuePort& value_port) { - if (auto known_attr = results_[value_port]) return known_attr; - auto attr = ::mlir::TF::ComputeOutputComponent( + return ::mlir::TF::ComputeOutputComponent( value_port, [this](const ValuePort& port) { return results_[port]; }); - RecordValue(value_port, attr); - return attr; } // Returns ShapeHandle if the op result could be computed as shape. @@ -525,35 +520,19 @@ class ShapeInference { LogicalResult PropagateShapeIntoAttachedFunctions(Operation* op, int64_t max_iteration); - // Propagates any constant operand of call_op to the called function body's - // corresponding argument if the callee has only one use. - // - // TODO(b/154065712): Move this to a more general inter-procedural constant - // folding pass. - void PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module); - - // Propagates any constant return value of the callee function to the call - // op's corresponding result. - void PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, ModuleOp module); - - // Tries to compute the result of folding the op. This doesn't actually - // perform constant folding, it is just computes the equivalent constants. 
- // Returns whether it was able to compute constant values. - LogicalResult TryToFold(Operation* op); - private: // Mapping between ValuePort (which corresponds to an OpResult or smaller, // e.g., first element of OpResult produded) to an Attribute if the ValuePort // corresponds to a constant value. ValuePortResultMap results_; int64_t graph_version_; + MLIRContext* context_; Dialect* tf_dialect_; }; ShapeInference::ShapeInference(int64_t graph_version, MLIRContext* context) : graph_version_(graph_version) { + context_ = context; tf_dialect_ = context->getRegisteredDialect(); } @@ -602,6 +581,7 @@ ShapeHandle ShapeInference::ComputeOutputAsShape(OpResult result, auto ret = ComputeOutputComponent(front); if (!ret) continue; + RecordValue(front, ret); LLVM_DEBUG(ret.print(llvm::dbgs() << "\ncomputed result = ")); // If worklist is empty, then this is the root query op. @@ -706,14 +686,10 @@ bool ShapeInference::InferShapeForSingleOperation(Operation* op) { size_t index = it.index(); // If the operand is constant, then convert it to Tensor. - ValuePort vp(operand); - Attribute attr = ComputeOutputComponent(vp); - if (!attr && matchPattern(operand, m_Constant(&attr))) - RecordValue(vp, attr); - if (attr) { + ElementsAttr attr; + if (matchPattern(operand, m_Constant(&attr))) { tensorflow::Tensor* input_tensor = &tensors[index]; - auto status = - tensorflow::ConvertToTensor(attr.cast(), input_tensor); + auto status = tensorflow::ConvertToTensor(attr, input_tensor); if (status.ok()) { input_tensors[index] = input_tensor; } else { @@ -889,9 +865,13 @@ LogicalResult ShapeInference::PropagateShapeToFunctions( return success(all_succeeded); } -void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, - ModuleOp module) { +// If the callee has only one use, propagates any constant operand of call_op to +// the called function body's corresponding argument. +// +// TODO(b/154065712): Move this to a more general inter-procedural constant +// folding pass. +void PropagateConstantToCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); auto func_uses = SymbolTable::getSymbolUses(func, &module.getBodyRegion()); int num_uses = std::distance(func_uses->begin(), func_uses->end()); @@ -899,29 +879,31 @@ void ShapeInference::PropagateConstantToCallee(CallOpInterface call_op, Operation* op = call_op.getOperation(); if (num_uses == 1) { // If this is the only caller, and an operand is a constant, propagate - // the constant value inside the function. + // the constant inside the function. for (auto arg : func.getArguments()) { - auto operand = op->getOperand(arg.getArgNumber()); - if (auto known_constant = ComputeOutputComponent(ValuePort(operand))) - RecordValue(ValuePort(arg), known_constant); + auto operand = op->getOperand(arg.getArgNumber()).getDefiningOp(); + if (isa_and_nonnull(operand)) { + arg.replaceAllUsesWith(builder.clone(*operand)->getResult(0)); + } } } } -void ShapeInference::PropagateConstantFromCallee(CallOpInterface call_op, - SymbolRefAttr callee_sym, - ModuleOp module) { +// Propagates any constant return value of the callee function to the call op's +// corresponding result. +void PropagateConstantFromCallee(CallOpInterface call_op, + SymbolRefAttr callee_sym, ModuleOp module) { auto func = module.lookupSymbol(callee_sym.getRootReference()); - // If the return value is a constant, use the constant as the value of - // the call return. 
+ // If the return value is a constant, replace the call result with a constant. Operation* op = call_op.getOperation(); OpBuilder builder(op); builder.setInsertionPointAfter(op); for (auto retval : llvm::enumerate(func.front().getTerminator()->getOperands())) { - ValuePort vp(retval.value()); - if (auto known_constant = ComputeOutputComponent(vp)) { - RecordValue(ValuePort(op->getResult(retval.index())), known_constant); + auto retval_op = retval.value().getDefiningOp(); + if (isa_and_nonnull(retval_op)) { + op->getResult(retval.index()) + .replaceAllUsesWith(builder.clone(*retval_op)->getResult(0)); } } } @@ -956,68 +938,10 @@ LogicalResult ShapeInference::PropagateShapeIntoAttachedFunctions( return success(); } -LogicalResult ShapeInference::TryToFold(Operation* op) { - // If any output result is known, then the op probably has been computed - // before. - if (op->getNumResults() > 0 && results_[ValuePort(op->getResult(0))]) - return success(); - - SmallVector constant_operands(op->getNumOperands()); - SmallVector fold_results; - - // Check to see if any operands to the operation is constant and whether - // the operation knows how to constant fold itself. - bool some_unknown = false; - for (int i = 0, e = op->getNumOperands(); i != e; ++i) { - if (!(constant_operands[i] = - ComputeOutputComponent(ValuePort(op->getOperand(i))))) - some_unknown = true; - } - - // Attempt to constant fold the operation. - auto* abstract_op = op->getAbstractOperation(); - if (abstract_op) { - if (failed(abstract_op->foldHook(op, constant_operands, fold_results))) - return failure(); - } else { - Dialect* dialect = op->getDialect(); - if (!dialect) return failure(); - // Only attempt TF dialect fallback if there are no unknown operands. - if (some_unknown && dialect == tf_dialect_) return failure(); - SmallVector constants; - if (failed(dialect->constantFoldHook(op, constant_operands, constants))) - return failure(); - fold_results.assign(constants.begin(), constants.end()); - } - - for (auto result : zip(op->getResults(), fold_results)) { - auto fold_result = std::get<1>(result); - Attribute attr = nullptr; - if ((attr = fold_result.dyn_cast())) { - RecordValue(ValuePort(std::get<0>(result)), attr); - } else { - auto value = fold_result.get(); - if ((attr = ComputeOutputComponent(ValuePort(value)))) - RecordValue(ValuePort(std::get<0>(result)), attr); - } - - if (ElementsAttr eattr = attr.dyn_cast_or_null()) { - if (std::get<0>(result).getType() == eattr.getType()) continue; - - // Inserts a cast back to the original type if any user is not in the - // TF dialect. - Type old_type = std::get<0>(result).getType(); - std::get<0>(result).setType(eattr.getType()); - AddCastBackForUnsupportedNonTFUses(op, std::get<0>(result), tf_dialect_, - old_type); - } - } - - return success(); -} - LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, int64_t max_iteration) { + // An operation folder that is used to attempt folding before inference._ + OperationFolder folder(context_); bool changed = true; // TODO(aminim): we could have a more efficient traversal by guiding the @@ -1031,7 +955,9 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, region->walk([&](Operation* op) { if (auto infer_ti = dyn_cast(op)) { changed |= RefineWithInferTypeOpInterface(infer_ti, tf_dialect_); - return; + // TODO(jpienaar): Debug why we can't just return here. We end up with + // additional constant due to the propagation of constant into attached + // function if we return already. 
} if (op->getDialect() != tf_dialect_) { @@ -1039,9 +965,8 @@ LogicalResult ShapeInference::InferShapeUntilFixPoint(Region* region, return; } - // Before attempting inference, just try to compute the folded - // value/shape. - if (succeeded(TryToFold(op))) return; + // Before attempting inference, just try to fold the operation. + if (succeeded(folder.tryToFold(op))) return; // Best-effort shape inference in attached functions. Do not return // failure even if it doesn't get to fixed point. From 8098b120097088423fe260d7633f4dfc9d882033 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 10:44:05 -0700 Subject: [PATCH 207/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311562488 Change-Id: I7dd029345a87fd0c982a8bbedefc29df8a5fd563 --- tensorflow/go/op/wrappers.go | 80 +++++++++++++++++------------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index c6d67c9ad44..a6ee1a13b6e 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -4715,7 +4715,7 @@ type DenseCountSparseOutputAttr func(optionalAttr) // DenseCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4727,7 +4727,7 @@ func DenseCountSparseOutputMinlength(value int64) DenseCountSparseOutputAttr { // DenseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -4742,20 +4742,20 @@ func DenseCountSparseOutputMaxlength(value int64) DenseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// values: int32 or int64; Tensor containing data to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// values: Tensor containing data to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. May +// also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. -func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. 
+func DenseCountSparseOutput(scope *Scope, values tf.Output, weights tf.Output, binary_output bool, optional ...DenseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } @@ -8607,7 +8607,7 @@ type RaggedCountSparseOutputAttr func(optionalAttr) // RaggedCountSparseOutputMinlength sets the optional minlength attribute to value. // -// value: int32; minimum value to count. Can be set to -1 for no minimum. +// value: Minimum value to count. Can be set to -1 for no minimum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8619,7 +8619,7 @@ func RaggedCountSparseOutputMinlength(value int64) RaggedCountSparseOutputAttr { // RaggedCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -8634,33 +8634,27 @@ func RaggedCountSparseOutputMaxlength(value int64) RaggedCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// splits: int64; Tensor containing the row splits of the ragged tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. +// splits: Tensor containing the row splits of the ragged tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// END -// } -// out_arg { -// name: "output_dense_shape" -// description: <= -1 @@ -13718,7 +13712,7 @@ func SparseCountSparseOutputMinlength(value int64) SparseCountSparseOutputAttr { // SparseCountSparseOutputMaxlength sets the optional maxlength attribute to value. // -// value: int32; maximum value to count. Can be set to -1 for no maximum. +// value: Maximum value to count. Can be set to -1 for no maximum. // If not specified, defaults to -1 // // REQUIRES: value >= -1 @@ -13733,22 +13727,22 @@ func SparseCountSparseOutputMaxlength(value int64) SparseCountSparseOutputAttr { // Counts the number of times each value occurs in the input. // // Arguments: -// indices: int64; Tensor containing the indices of the sparse tensor to count. -// values: int32 or int64; Tensor containing values of the sparse tensor to count. -// dense_shape: int64; Tensor containing the dense shape of the sparse tensor to count. -// weights: float32; Optional rank 1 Tensor (shape=[max_values]) with weights for each count value. -// binary_count: bool; whether to output the number of occurrences of each value or 1. -// output_type: dtype; dtype of the output values tensor. 
+// indices: Tensor containing the indices of the sparse tensor to count. +// values: Tensor containing values of the sparse tensor to count. +// dense_shape: Tensor containing the dense shape of the sparse tensor to count. +// weights: A Tensor of the same shape as indices containing per-index weight values. +// May also be the empty tensor if no weights are used. +// binary_output: Whether to output the number of occurrences of each value or 1. // // Returns: -// output_indices: int64; indices tensor for the resulting sparse tensor object. -// output_values: int64 or float32; values tensor for the resulting sparse tensor object. -// output_dense_shape: int64; shape tensor for the resulting sparse tensor object. -func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_count bool, output_type tf.DataType, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { +// output_indices: Indices tensor for the resulting sparse tensor object. +// output_values: Values tensor for the resulting sparse tensor object. +// output_dense_shape: Shape tensor for the resulting sparse tensor object. +func SparseCountSparseOutput(scope *Scope, indices tf.Output, values tf.Output, dense_shape tf.Output, weights tf.Output, binary_output bool, optional ...SparseCountSparseOutputAttr) (output_indices tf.Output, output_values tf.Output, output_dense_shape tf.Output) { if scope.Err() != nil { return } - attrs := map[string]interface{}{"binary_count": binary_count, "output_type": output_type} + attrs := map[string]interface{}{"binary_output": binary_output} for _, a := range optional { a(attrs) } From 0d94bc6d71f89a380e0b57967f6c78d59f5785f1 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 14 May 2020 11:02:42 -0700 Subject: [PATCH 208/412] Fix style in `op_hint.py` to match formatting from Copybara. No functional changes PiperOrigin-RevId: 311566454 Change-Id: Ic4f002df42168bdb8841b80a93ebf22a8e7fa4bd --- tensorflow/lite/python/op_hint.py | 62 ++++++++++++++++++------------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py index 159fcaa2bf3..9d62c1b8a97 100644 --- a/tensorflow/lite/python/op_hint.py +++ b/tensorflow/lite/python/op_hint.py @@ -435,6 +435,7 @@ class OpHint(object): Args: *args: List of inputs to be converted (should be Tf.Tensor). **kwargs: This allows 'names' which should be a list of names. + Returns: Wrapped inputs (identity standins that have additional metadata). These are also are also tf.Tensor's. @@ -453,6 +454,7 @@ class OpHint(object): Args: *args: List of outputs to be converted (should be tf.Tensor). **kwargs: See + Returns: Wrapped outputs (identity standins that have additional metadata). These are also tf.Tensor's. 
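To make the add_inputs/add_outputs flow described above concrete, here is a
minimal sketch (identifiers are illustrative, and the constructor argument and
converter call are assumed from the docstrings elsewhere in this file rather
than from this hunk):

    custom = OpHint("cool_activation")         # name used for the fused stub op
    x, = custom.add_inputs(x, names=["x"])     # wrap inputs with identity standins
    y = tf.sigmoid(x) * x                      # subgraph to be stubbed out later
    y, = custom.add_outputs(y, names=["y"])    # wrap outputs the same way
    stubbed_graph_def = convert_op_hints_to_stubs(session=sess)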
@@ -574,8 +576,8 @@ class _LiteAggregateOperand(_LiteOperand): elif self.aggregation == OpHint.AGGREGATE_STACK: pass else: - raise ValueError( - "Invalid aggregation type %r specified" % self.aggregation) + raise ValueError("Invalid aggregation type %r specified" % + self.aggregation) return self.flattened def flatten(self): @@ -646,8 +648,8 @@ class _LiteAggregateOperand(_LiteOperand): stack_node.attr["num"].i = len(flattened) output_type = flattened[0].attr["T"].type stack_node.attr["T"].type = output_type - stack_node.input.append(_tensorflow_output_name( - fused_op_name, output_index)) + stack_node.input.append( + _tensorflow_output_name(fused_op_name, output_index)) out_graphdef.node.extend([stack_node]) for idx, discrete in enumerate(flattened): @@ -675,11 +677,10 @@ class _LiteFuncCall(object): inputs: inputs to the op (hash from index # to argument) outputs: outputs to the op (hash from index # to argument) function_name: the tflite custom op name to use - uuid: a unique call id for this particular call (i.e. - multiple function calls would have the same function_name but different - uuids. - params: A param name to key value for op constant data. I.e. for - axis on a reduction, strides on a convolution, etc. + uuid: a unique call id for this particular call (i.e. multiple function + calls would have the same function_name but different uuids. + params: A param name to key value for op constant data. I.e. for axis on a + reduction, strides on a convolution, etc. level: Level of the OpHint. children_inputs_mappings: If the Ophint has children, children inputs mappings indicate how their inputs & outputs are mapped. @@ -700,6 +701,7 @@ class _LiteFuncCall(object): Returns: Tuple of (inputs, outputs). where input and output i a list of names. """ + def _flatten(input_or_output_dict): flattened_items = [] for item in input_or_output_dict.values(): @@ -709,6 +711,7 @@ class _LiteFuncCall(object): return _flatten(self.inputs), _flatten(self.outputs) def __str__(self): + def format_args(items): s = "" for idx, item in items.iteritems(): @@ -739,8 +742,8 @@ def _find_all_hints_in_nodes(nodes): for node in nodes: attr = node.attr # This is an op hint if it has a FUNCTION_UUID_ATTR, otherwise skip - if (OpHint.FUNCTION_UUID_ATTR not in attr - or not attr[OpHint.FUNCTION_UUID_ATTR].s): + if (OpHint.FUNCTION_UUID_ATTR not in attr or + not attr[OpHint.FUNCTION_UUID_ATTR].s): continue uuid = attr[OpHint.FUNCTION_UUID_ATTR].s @@ -751,9 +754,11 @@ def _find_all_hints_in_nodes(nodes): call_def.level = attr[OpHint.FUNCTION_LEVEL_ATTR].i # Get sorting and aggregation information - sort = (attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i - if OpHint.FUNCTION_SORT_INDEX_ATTR in attr else None) - if sort == -1: sort = None + sort = ( + attr[OpHint.FUNCTION_SORT_INDEX_ATTR].i + if OpHint.FUNCTION_SORT_INDEX_ATTR in attr else None) + if sort == -1: + sort = None aggregation = None if OpHint.FUNCTION_AGGREGATE_ATTR in attr: aggregation = _compat.as_text(attr[OpHint.FUNCTION_AGGREGATE_ATTR].s) @@ -887,6 +892,7 @@ def _tensor_name_base(full_tensor_name): Args: full_tensor_name: A tensor name that is annotated with a device placement (this is what tensor flow introspection gives). + Returns: A name without any device assignment. 
""" @@ -919,10 +925,10 @@ def _check_subgraph_closed(n, reachable_by_input, input_nodes_set, while next_to_visit: current_node = next_to_visit.pop() visited.add(current_node) - if (current_node in reachable_by_input - and current_node not in input_nodes_set): - raise TypeError( - "Node %s uses input %s not in input_nodes." % (n, current_node)) + if (current_node in reachable_by_input and + current_node not in input_nodes_set): + raise TypeError("Node %s uses input %s not in input_nodes." % + (n, current_node)) if current_node not in input_nodes_set: next_to_visit += [ input_node for input_node in name_to_input_name[current_node] @@ -1066,6 +1072,7 @@ def _remove_one_redundant_stack_unstack(in_graph_def): Args: in_graph_def: Graph def to use as input. + Returns: Simplified tuple (graph_def, changed_something) where changed_something is true if anything was done. @@ -1101,15 +1108,15 @@ def _remove_one_redundant_stack_unstack(in_graph_def): node = name_to_node[current_node_name] is_op_hint_stack = node.name.startswith("OpHintStack") is_op_hint_unstack = node.name.startswith("OpHintUnstack") - if (node.op == "Identity" or is_op_hint_stack - or (do_generic_pack_unpack and node.op == "Pack")): + if (node.op == "Identity" or is_op_hint_stack or + (do_generic_pack_unpack and node.op == "Pack")): is_hint_created_stack |= is_op_hint_stack next_to_visit += [ input_node for input_node in name_to_input_name[current_node_name] if input_node not in visited ] - elif (is_op_hint_unstack - or (do_generic_pack_unpack and node.op == "Unpack")): + elif (is_op_hint_unstack or + (do_generic_pack_unpack and node.op == "Unpack")): unpack_nodes.add(node.name) is_hint_created_stack &= is_op_hint_unstack else: @@ -1124,7 +1131,8 @@ def _remove_one_redundant_stack_unstack(in_graph_def): # Unstacked form no_external_dependency = True for other_n in in_graph_def.node: - if other_n.name in visited: continue + if other_n.name in visited: + continue for input_tensor in name_to_input_name[other_n.name]: input_op = _tensor_name_base(input_tensor) if input_op in visited and input_op != pack_node: @@ -1141,9 +1149,9 @@ def _remove_one_redundant_stack_unstack(in_graph_def): if node_name not in visited: new_node = _copy.deepcopy(other_n) new_node.input[:] = [ - (end_input if stripped == pack_node else - non_stripped) for stripped, non_stripped in zip( - name_to_input_name[node_name], new_node.input[:]) + (end_input if stripped == pack_node else non_stripped) + for stripped, non_stripped in zip(name_to_input_name[node_name], + new_node.input[:]) ] out.node.extend([new_node]) return out, True @@ -1177,6 +1185,7 @@ def _convert_op_hints_to_stubs_helper( graph_def: A graph def that we should convert. write_callback: A function pointer that can be used to write intermediate steps of graph transformation (optional). + Returns: A new stubbed graph_def. """ @@ -1306,6 +1315,7 @@ def convert_op_hints_to_stubs(session=None, graph_def: A graph def that we should convert. write_callback: A function pointer that can be used to write intermediate steps of graph transformation (optional). + Returns: A new graphdef with all ops contained in OpHints being replaced by a single op call with the right parameters. From 5a4e21c1c6e97aa9e1f31a5fe4ac763bd5b57381 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Thu, 14 May 2020 11:21:19 -0700 Subject: [PATCH 209/412] Add test for PromoteVarHandlesToArgs pass for testing users of tf.VarHandleOps (NFC). 
PiperOrigin-RevId: 311570365 Change-Id: I65d0d98b43e4d4b15fa3e798dfd4b58fccb40ec9 --- .../tests/promote_var_handles_to_args.mlir | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir index 5e53a457ecb..8b8a070cfab 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/promote_var_handles_to_args.mlir @@ -44,3 +44,16 @@ func @duplicate_vars() { %1 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> return } + +// CHECK-LABEL: func @duplicate_vars_with_users +// CHECK-SAME: (%arg0: tensor, %arg1: tensor>> {tf.resource_name = "x"}) +// CHECK: "tf.ReadVariableOp"(%arg1) +// CHECK: "tf.AssignAddVariableOp"(%arg1, %arg0) +// CHECK-NOT: "tf.VarHandleOp" +func @duplicate_vars_with_users(%arg0: tensor) { + %0 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + %1 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor + %2 = "tf.VarHandleOp"() {container = "", shape = "tfshape$", shared_name = "x"} : () -> tensor>> + "tf.AssignAddVariableOp"(%2, %arg0) : (tensor>>, tensor) -> () + return +} From ba43780830f09da72081fe5061c436f1c6203a92 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 14 May 2020 12:13:06 -0700 Subject: [PATCH 210/412] Generate MLIR ops for TPU Host/Device communication for outside compilation. These ops are needed for communicating dependencies(data or control flow) between TPU device calculations and outside compiled computations run on host. PiperOrigin-RevId: 311580827 Change-Id: Ia82623ae2a3535b829691952063724cfaedf22bb --- .../mlir/tensorflow/ir/tf_generated_ops.td | 103 ++++++++++++++++++ tensorflow/core/ops/tpu_host_compute_ops.cc | 6 +- 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 64ea0732e8c..aa1601c4032 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -10329,6 +10329,33 @@ https://www.tensorflow.org/xla/operation_semantics#gather TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaHostComputeOp : TF_Op<"XlaHostCompute", []> { + let summary = [{ +A pseudo-op to represent host-side computation in an XLA program. 
+ }]; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + + StrArrayAttr:$ancestors, + TF_ShapeAttrArray:$shapes, + SymbolRefAttr:$shape_inference_graph, + StrAttr:$key, + DefaultValuedAttr:$cost_estimate_ns, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF_XlaKeyValueSortOp : TF_Op<"XlaKeyValueSort", [NoSideEffect]> { let summary = "Wraps the XLA Sort operator, documented at"; @@ -10377,6 +10404,24 @@ https://www.tensorflow.org/performance/xla/operation_semantics#pad TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaRecvFromHostOp : TF_Op<"XlaRecvFromHost", []> { + let summary = "An op to receive a tensor from the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_ShapeAttr:$shape, + StrAttr:$key + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedResultTypeAttr Toutput = TF_DerivedResultTypeAttr<0>; +} + def TF_XlaReduceOp : TF_Op<"XlaReduce", [NoSideEffect]> { let summary = "Wraps the XLA Reduce operator, documented at"; @@ -10441,6 +10486,23 @@ i=0...N-1. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_XlaSendToHostOp : TF_Op<"XlaSendToHost", []> { + let summary = "An op to send a tensor to the host."; + + let description = [{ + }]; + + let arguments = (ins + TF_Tensor:$input, + + StrAttr:$key + ); + + let results = (outs); + + TF_DerivedOperandTypeAttr Tinput = TF_DerivedOperandTypeAttr<0>; +} + def TF_XlaSvdOp : TF_Op<"XlaSvd", [NoSideEffect]> { let summary = [{ Computes the eigen decomposition of a batch of self-adjoint matrices @@ -10582,3 +10644,44 @@ used to look up the program in the compilation cache. TF_DerivedResultSizeAttr num_computations = TF_DerivedResultSizeAttr<1>; TF_DerivedOperandSizeAttr NumDynamicShapes = TF_DerivedOperandSizeAttr<0>; } + +def TF__XlaRecvAtHostOp : TF_Op<"_XlaRecvAtHost", []> { + let summary = [{ +A placeholder op to receive values from a running XLA computation. + }]; + + let description = [{ + }]; + + let arguments = (ins + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + +def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", []> { + let summary = "A placeholder op to send values to a running XLA computation."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + TF_StrTensor:$dynamic_key, + + StrAttr:$key, + I64Attr:$device_ordinal + ); + + let results = (outs); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; +} diff --git a/tensorflow/core/ops/tpu_host_compute_ops.cc b/tensorflow/core/ops/tpu_host_compute_ops.cc index 48aeb81ac13..753cc0015d9 100644 --- a/tensorflow/core/ops/tpu_host_compute_ops.cc +++ b/tensorflow/core/ops/tpu_host_compute_ops.cc @@ -28,8 +28,7 @@ REGISTER_OP("_XlaSendFromHost") .SetIsStateful() .SetShapeFn(::tensorflow::shape_inference::NoOutputs) .Doc(R"doc( -A placeholder op for multiple values that will be sent from TensorFlow to a -running XLA computation. +A placeholder op to send values to a running XLA computation. inputs: A list of tensors that will be sent to the XLA computation. 
dynamic_key: The key sent at runtime by the compile node to identify which @@ -49,8 +48,7 @@ REGISTER_OP("_XlaRecvAtHost") .SetIsStateful() .SetShapeFn(::tensorflow::shape_inference::UnknownShape) .Doc(R"doc( -A placeholder op for multiple values that will be sent to TensorFlow from a -running XLA computation. +A placeholder op to receive values from a running XLA computation. dynamic_key: The key sent at runtime by the compile node to identify which execution the transfer corresponds to. From 215616fddc5731023739da5ab1ebb51cadfc452e Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 14 May 2020 12:35:12 -0700 Subject: [PATCH 211/412] Add support for setting up a TF_OutputList from the client and use it to build function with multiple results PiperOrigin-RevId: 311585364 Change-Id: I5245fd0f5e5c0e8e7e22350d970c508e0154d59b --- .../c/eager/c_api_unified_experimental.cc | 4 + .../c/eager/c_api_unified_experimental.h | 18 ++- .../eager/c_api_unified_experimental_graph.cc | 4 + .../eager/c_api_unified_experimental_test.cc | 147 +++++++++++++++++- 4 files changed, 164 insertions(+), 9 deletions(-) diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc b/tensorflow/c/eager/c_api_unified_experimental.cc index d29c457798e..e5030a602b3 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -127,6 +127,10 @@ int TF_OutputListNumOutputs(TF_OutputList* o) { TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i) { return wrap(unwrap(o)->outputs[i]); } +void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status* s) { + unwrap(o)->outputs.push_back(unwrap(tensor)); +} void TF_AbstractOpSetOpType(TF_AbstractOp* op, const char* const op_type, TF_Status* s) { diff --git a/tensorflow/c/eager/c_api_unified_experimental.h b/tensorflow/c/eager/c_api_unified_experimental.h index 512717caa34..86c59a7f625 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.h +++ b/tensorflow/c/eager/c_api_unified_experimental.h @@ -88,19 +88,21 @@ void TF_AbstractOpSetAttrType(TF_AbstractOp* op, const char* const attr_name, void TF_DeleteAbstractTensor(TF_AbstractTensor*); // TF_OutputList holds the list of TF_AbstractTensor that results from executing -// an operation. -// It just lets us not specify the number of outputs of an operation -// beforehand. This forces a memory allocation in the runtime, which is bad, but -// it allows for generic code. -// TODO(aminim): the description above isn't clear with respect to -// TF_OutputListNumOutputs and the current eager implementation which requires -// the number of outputs to be set by the client. +// an operation, or provided to create a function. +// When executing an operation in an eager context, the expected number of +// outputs must be set beforehand with `TF_OutputListSetNumOutputs`. typedef struct TF_OutputList TF_OutputList; TF_OutputList* TF_NewOutputList(); void TF_DeleteOutputList(TF_OutputList* o); -void TF_OutputListSetNumOutputs(TF_OutputList* o, int, TF_Status*); +// Prepare tracing to the expected number of output for an operation. +void TF_OutputListSetNumOutputs(TF_OutputList* o, int num_outputs, TF_Status*); +// Return the number of outputs in the list. int TF_OutputListNumOutputs(TF_OutputList* o); +// Return the `i`th output in the list. TF_AbstractTensor* TF_OutputListGet(TF_OutputList* o, int i); +// Append a tensor at the end of the output list, growing its size by one. 
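// A rough usage sketch for returning several results from a traced function
// (status checks omitted; `graph_ctx`, `t0` and `t1` are assumed to come from
// earlier tracing calls):
//
//   TF_OutputList* results = TF_NewOutputList();
//   TF_OutputListPushBack(results, t0, s);
//   TF_OutputListPushBack(results, t1, s);
//   TF_AbstractFunction* fn = TF_FinalizeFunction(graph_ctx, results, s);
//   TF_DeleteOutputList(results);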
+void TF_OutputListPushBack(TF_OutputList* o, TF_AbstractTensor* tensor, + TF_Status*); // TF_ExecuteOperation will, if in eager mode, execute, if in graph mode, maybe // capture some inputs and then add a node in the graph. The output tensors are diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index e38332e3e8e..dd5a95b3526 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -139,6 +139,10 @@ class GraphContext : public ExecutionContext { return; } auto* tf_opdesc = graph_op->op_.release(); + if (tf_opdesc == nullptr) { + TF_SetStatus(s, TF_INVALID_ARGUMENT, "AbstractOp is incomplete."); + return; + } for (int i = 0; i < num_inputs; ++i) { auto* graph_tensor = dyncast(inputs[i]); if (!graph_tensor) { diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index 9f56c8aa579..9776b4d13ed 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -169,7 +169,152 @@ TEST_P(UnifiedCAPI, TestBasicGraph) { TF_DeleteExecutionContext(eager_execution_ctx); } -TEST_P(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { +TEST_P(UnifiedCAPI, TestMultiOutputGraph) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_Status* s = status.get(); + + // Start a new function / execution context. + string fn_name = "two_adds"; + TF_ExecutionContext* graph_ctx = TF_CreateFunction(fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + auto* arg0 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + auto* arg1 = TF_AddFunctionParameter(graph_ctx, TF_FLOAT, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Create a first "Add" computing `arg0 + arg1`. + TF_AbstractTensor* add_output1; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add1", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg0, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. + add_output1 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Same with a second "Add" computing `arg1 + arg1`. + TF_AbstractTensor* add_output2; + { + // Build an abstract operation, inputs and output. + auto* add_op = TF_NewAbstractOp(graph_ctx); + TF_AbstractOpSetOpType(add_op, "Add", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractOpSetOpName(add_op, "my_add2", s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_AbstractTensor* inputs[2] = {arg1, arg1}; + TF_OutputList* add_outputs = TF_NewOutputList(); + // Trace the operation now (create a node in the graph). + TF_ExecuteOperation(add_op, 2, inputs, add_outputs, graph_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(add_op); + // Extract the resulting tensor. 
+ add_output2 = TF_OutputListGet(add_outputs, 0); + TF_DeleteOutputList(add_outputs); + } + + // Finalize the function by providing the returned values. + TF_AbstractFunction* func; + { + // We want to return the output of both add operations, create a new list + // and populate it. + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListPushBack(func_outputs, add_output1, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_OutputListPushBack(func_outputs, add_output2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + func = TF_FinalizeFunction(graph_ctx, func_outputs, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteOutputList(func_outputs); + } + + /** + * We traced so far this function: + * + * def two_adds(a, b): + * my_add1 = a + b + * my_add2 = b + b + * return my_add1, my_add2 + * + * Now we will execute this function with an eager context: + * + * output1, output2 = two_adds(2.0, 3.0) + * + * and check that we got 5.0 and 6.0 as results. + */ + + // Build eager context. + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TF_ExecutionContext* eager_execution_ctx = + TF_NewEagerExecutionContext(opts, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TFE_DeleteContextOptions(opts); + + TF_ExecutionContextRegisterFunction(eager_execution_ctx, func, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build the abstract op to run the function. + TF_AbstractOp* fn_op = TF_NewAbstractOp(eager_execution_ctx); + TF_AbstractOpSetOpType(fn_op, fn_name.c_str(), s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + + // Build two abstract input tensors as function arguments. + std::vector func_args; + { + TFE_Context* eager_ctx = + TF_ExecutionContextGetTFEContext(eager_execution_ctx); + TFE_TensorHandle* input_eager = TestScalarTensorHandle(eager_ctx, 2.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + input_eager = TestScalarTensorHandle(eager_ctx, 3.0f); + func_args.push_back(TF_CreateAbstractTensorFromEagerTensor(input_eager, s)); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + } + + TF_OutputList* func_outputs = TF_NewOutputList(); + TF_OutputListSetNumOutputs(func_outputs, 2, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_ExecuteOperation(fn_op, func_args.size(), func_args.data(), func_outputs, + eager_execution_ctx, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_DeleteAbstractOp(fn_op); + for (TF_AbstractTensor* t : func_args) TF_DeleteAbstractTensor(t); + + ASSERT_EQ(2, TF_OutputListNumOutputs(func_outputs)); + float results[2]; + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TFE_TensorHandle* handle = TF_AbstractTensorGetEagerTensor(result, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + TF_Tensor* f_t = TFE_TensorHandleResolve(handle, s); + ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); + results[idx] = *static_cast(TF_TensorData(f_t)); + TF_DeleteTensor(f_t); + } + ASSERT_EQ(results[0], 5.0); + ASSERT_EQ(results[1], 6.0); + + for (int idx = 0; idx < 2; ++idx) { + TF_AbstractTensor* result = TF_OutputListGet(func_outputs, idx); + TF_DeleteAbstractTensor(result); + } + TF_DeleteOutputList(func_outputs); + TF_DeleteExecutionContext(eager_execution_ctx); + TF_DeleteAbstractFunction(func); +} + +TEST(UnifiedCAPI, TF_ExecutionContextToFunctionWithEagerContextRaises) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); 
TFE_ContextOptions* opts = TFE_NewContextOptions(); From f4a49c6871a36444a0a28e9b127ab052efc1f9ca Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Thu, 14 May 2020 12:40:01 -0700 Subject: [PATCH 212/412] Set sparse FullyConnected op version properly. PiperOrigin-RevId: 311586496 Change-Id: Ieb57857388bbb25de02163b9a6594dd02666b867 --- tensorflow/lite/toco/tflite/operator.cc | 1 + tensorflow/lite/tools/versioning/op_version.cc | 10 ++++++++++ tensorflow/lite/tools/versioning/op_version.h | 3 +++ tensorflow/lite/tools/versioning/op_version_test.cc | 9 +++++++++ 4 files changed, 23 insertions(+) diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc index 917fd24c952..fee10a19787 100644 --- a/tensorflow/lite/toco/tflite/operator.cc +++ b/tensorflow/lite/toco/tflite/operator.cc @@ -487,6 +487,7 @@ class FullyConnected op_sig.options.fully_connected.keep_num_dims = fc_op.keep_num_dims; op_sig.options.fully_connected.weights_format = GetWeightFormat(fc_op.weights_format); + op_sig.options.fully_connected.sparse_weight = false; return ::tflite::GetBuiltinOperatorVersion(op_sig); } }; diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index 9022afca629..118e2d420f8 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -121,6 +121,11 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { // | Quantized Int8 | 4 | 4 | // +-----------------+--------------------+--------------------------+ + // FullyConnected with sparse weight is supported at version 8. + if (op_sig.options.fully_connected.sparse_weight) { + return 8; + } + // Int16 fully fixed point kernel is at version 7. if (op_sig.input_types.at(0) == TensorType_INT16 && op_sig.input_types.at(1) == TensorType_INT16 && @@ -578,6 +583,11 @@ OpSignature GetOpSignature(const OperatorCode* op_code, const Operator* op, op_sig.options.fully_connected.weights_format = fully_connected_option->weights_format(); } + + const Tensor* weight_tensor = + subgraph->tensors()->Get(op->inputs()->Get(1)); + op_sig.options.fully_connected.sparse_weight = + (weight_tensor->sparsity() != nullptr); } break; case BuiltinOperator_MUL: { diff --git a/tensorflow/lite/tools/versioning/op_version.h b/tensorflow/lite/tools/versioning/op_version.h index 4b0fe8836e2..df74ffaf6dd 100644 --- a/tensorflow/lite/tools/versioning/op_version.h +++ b/tensorflow/lite/tools/versioning/op_version.h @@ -37,6 +37,9 @@ typedef struct { struct { bool keep_num_dims; FullyConnectedOptionsWeightsFormat weights_format; + // TODO(b/156530611): Make this global when more ops support sparse + // computation. 
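      // True when the FullyConnected weight tensor carries sparsity metadata;
      // GetOpSignature() derives it from `weight_tensor->sparsity() != nullptr`,
      // and a sparse weight bumps the op to version 8.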
+ bool sparse_weight; } fully_connected; struct { float input1_scale; diff --git a/tensorflow/lite/tools/versioning/op_version_test.cc b/tensorflow/lite/tools/versioning/op_version_test.cc index f0d8259d764..4017fc3bff0 100644 --- a/tensorflow/lite/tools/versioning/op_version_test.cc +++ b/tensorflow/lite/tools/versioning/op_version_test.cc @@ -352,6 +352,15 @@ TEST(OpVersionTest, VersioningFullyConnectedTest) { fake_op_sig.options.fully_connected = { false, FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8}; EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 6); + + fake_op_sig = { + .op = BuiltinOperator_FULLY_CONNECTED, + .input_types = std::vector{TensorType_INT8, TensorType_INT8}, + .output_types = std::vector{TensorType_INT8}, + }; + fake_op_sig.options.fully_connected = { + false, FullyConnectedOptionsWeightsFormat_DEFAULT, true}; + EXPECT_EQ(GetBuiltinOperatorVersion(fake_op_sig), 8); } TEST(OpVersionTest, VersioningDequantizeTest) { From 45d18ddb7ee181d5f847c64558ad72d63e9db609 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 13:22:03 -0700 Subject: [PATCH 213/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311594498 Change-Id: I8a91e5e8f8418d44ece61b1c52c76892ff949d0b --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a6ee1a13b6e..e6725269279 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 66769844a5c58e8a25352d8a16ff40b04f6c523e Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Thu, 14 May 2020 13:27:42 -0700 Subject: [PATCH 214/412] [XLA:CPU] Allow C64 and C128 types in Sort(). These seem to have been omitted mostly as an oversight; the logic in Sort() doesn't seem to be data-type specific. 
PiperOrigin-RevId: 311595522 Change-Id: I6264bbe6556a0823e8a88e2025c4886182aad6bf --- .../compiler/xla/service/cpu/ir_emitter.cc | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index f516a1538d3..5a4c6250293 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include + #include #include #include @@ -570,25 +571,9 @@ Status IrEmitter::HandleSort(HloInstruction* hlo) { TF_RETURN_IF_ERROR(EmitTargetAddressForOp(sort)); Shape keys_shape = sort->keys()->shape(); PrimitiveType keys_type = keys_shape.element_type(); - switch (keys_type) { - case PRED: - case S8: - case U8: - case S16: - case U16: - case BF16: - case F16: - case S32: - case U32: - case F32: - case S64: - case U64: - case F64: - break; - default: - return Unimplemented( - "Element type %s not supported in the Sort op on CPU.", - PrimitiveType_Name(keys_type)); + if (!primitive_util::IsArrayType(keys_type)) { + return Unimplemented("Element type %s not supported in the Sort op on CPU.", + PrimitiveType_Name(keys_type)); } std::vector destination_addresses(sort->operand_count()); for (int64 i = 0; i < sort->operand_count(); ++i) { From 6db3caf99be91664813ae621e62c3287e2af44d3 Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Thu, 14 May 2020 13:28:33 -0700 Subject: [PATCH 215/412] Update gather_op_test and unique_op_test to use subTest for easier debugging. PiperOrigin-RevId: 311595699 Change-Id: I1a8cf8b5b314aada4aeeece2603e975bc8a4ff42 --- .../python/kernel_tests/gather_op_test.py | 213 +++++++++--------- .../python/kernel_tests/unique_op_test.py | 59 ++--- 2 files changed, 142 insertions(+), 130 deletions(-) diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py index 953f18bb07a..b966110963c 100644 --- a/tensorflow/python/kernel_tests/gather_op_test.py +++ b/tensorflow/python/kernel_tests/gather_op_test.py @@ -62,14 +62,15 @@ class GatherTest(test.TestCase, parameterized.TestCase): data = np.array([0, 1, 2, 3, 7, 5]) for dtype in _TEST_TYPES: for indices in 4, [1, 2, 2, 4, 5]: - params_np = self._buildParams(data, dtype) - params = constant_op.constant(params_np) - indices_tf = constant_op.constant(indices) - gather_t = array_ops.gather(params, indices_tf) - gather_val = self.evaluate(gather_t) - np_val = params_np[indices] - self.assertAllEqual(np_val, gather_val) - self.assertEqual(np_val.shape, gather_t.get_shape()) + with self.subTest(dtype=dtype, indices=indices): + params_np = self._buildParams(data, dtype) + params = constant_op.constant(params_np) + indices_tf = constant_op.constant(indices) + gather_t = array_ops.gather(params, indices_tf) + gather_val = self.evaluate(gather_t) + np_val = params_np[indices] + self.assertAllEqual(np_val, gather_val) + self.assertEqual(np_val.shape, gather_t.get_shape()) def testScalar2D(self): with self.session(use_gpu=True): @@ -77,14 +78,15 @@ class GatherTest(test.TestCase, parameterized.TestCase): [9, 10, 11], [12, 13, 14]]) for dtype in _TEST_TYPES: for axis in range(data.ndim): - params_np = self._buildParams(data, dtype) - params = constant_op.constant(params_np) - indices = constant_op.constant(2) - gather_t = array_ops.gather(params, indices, axis=axis) - gather_val = self.evaluate(gather_t) - self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val) - 
expected_shape = data.shape[:axis] + data.shape[axis + 1:] - self.assertEqual(expected_shape, gather_t.get_shape()) + with self.subTest(dtype=dtype, axis=axis): + params_np = self._buildParams(data, dtype) + params = constant_op.constant(params_np) + indices = constant_op.constant(2) + gather_t = array_ops.gather(params, indices, axis=axis) + gather_val = self.evaluate(gather_t) + self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val) + expected_shape = data.shape[:axis] + data.shape[axis + 1:] + self.assertEqual(expected_shape, gather_t.get_shape()) def testSimpleTwoD32(self): with self.session(use_gpu=True): @@ -92,16 +94,17 @@ class GatherTest(test.TestCase, parameterized.TestCase): [9, 10, 11], [12, 13, 14]]) for dtype in _TEST_TYPES: for axis in range(data.ndim): - params_np = self._buildParams(data, dtype) - params = constant_op.constant(params_np) - # The indices must be in bounds for any axis. - indices = constant_op.constant([0, 1, 0, 2]) - gather_t = array_ops.gather(params, indices, axis=axis) - gather_val = self.evaluate(gather_t) - self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis), - gather_val) - expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:] - self.assertEqual(expected_shape, gather_t.get_shape()) + with self.subTest(dtype=dtype, axis=axis): + params_np = self._buildParams(data, dtype) + params = constant_op.constant(params_np) + # The indices must be in bounds for any axis. + indices = constant_op.constant([0, 1, 0, 2]) + gather_t = array_ops.gather(params, indices, axis=axis) + gather_val = self.evaluate(gather_t) + self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis), + gather_val) + expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:] + self.assertEqual(expected_shape, gather_t.get_shape()) @test_util.run_deprecated_v1 def testHigherRank(self): @@ -112,58 +115,60 @@ class GatherTest(test.TestCase, parameterized.TestCase): for axis in range(len(shape)): params = self._buildParams(np.random.randn(*shape), dtype) indices = np.random.randint(shape[axis], size=indices_shape) - with self.cached_session(use_gpu=True) as sess: - tf_params = constant_op.constant(params) - tf_indices = constant_op.constant(indices) - # Check that both positive and negative indices for axis work. - tf_axis = constant_op.constant(axis) - tf_negative_axis = constant_op.constant(-len(shape) + axis) - gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) - gather_negative_axis = array_ops.gather( - tf_params, tf_indices, axis=tf_negative_axis) - gather_value, gather_negative_axis_value = sess.run( - [gather, gather_negative_axis]) - gather_np = np.take(params, indices, axis) - self.assertAllEqual(gather_np, gather_value) - self.assertAllEqual(gather_np, gather_negative_axis_value) - expected_shape = (params.shape[:axis] + indices.shape + - params.shape[axis + 1:]) - self.assertEqual(expected_shape, gather.shape) - self.assertEqual(expected_shape, gather_negative_axis.shape) + with self.subTest(indices_shape=indices_shape, dtype=dtype, axis=axis, + indices=indices): + with self.cached_session(use_gpu=True) as sess: + tf_params = constant_op.constant(params) + tf_indices = constant_op.constant(indices) + # Check that both positive and negative indices for axis work. 
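          # (-len(shape) + axis addresses the same dimension counted from the
          # end, which is what the negative-axis constant below exercises.)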
+ tf_axis = constant_op.constant(axis) + tf_negative_axis = constant_op.constant(-len(shape) + axis) + gather = array_ops.gather(tf_params, tf_indices, axis=tf_axis) + gather_negative_axis = array_ops.gather( + tf_params, tf_indices, axis=tf_negative_axis) + gather_value, gather_negative_axis_value = sess.run( + [gather, gather_negative_axis]) + gather_np = np.take(params, indices, axis) + self.assertAllEqual(gather_np, gather_value) + self.assertAllEqual(gather_np, gather_negative_axis_value) + expected_shape = (params.shape[:axis] + indices.shape + + params.shape[axis + 1:]) + self.assertEqual(expected_shape, gather.shape) + self.assertEqual(expected_shape, gather_negative_axis.shape) - # Test gradients - gather_grad = np.random.randn( - *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) - if dtype.is_complex: - gather_grad -= 1j * gather_grad - params_grad, indices_grad, axis_grad = gradients_impl.gradients( - gather, [tf_params, tf_indices, tf_axis], gather_grad) - self.assertEqual(indices_grad, None) - self.assertEqual(axis_grad, None) - if dtype.is_integer: - self.assertEqual(params_grad, None) - continue - # For axis 0, we are able to create an efficient IndexedSlices for - # the gradient. - if axis == 0: - self.assertEqual(type(params_grad), ops.IndexedSlices) - params_grad = ops.convert_to_tensor(params_grad) - correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) - outer_dims = axis - inner_dims = len(shape) - axis - 1 - gather_grad = gather_grad.reshape( - shape[:axis] + (indices.size,) + shape[axis + 1:]) - for source_index, dest_index in enumerate(indices.flat): - dest_slice = ((slice(None),) * outer_dims + (dest_index,) + - (slice(None),) * inner_dims) - source_slice = ((slice(None),) * outer_dims + (source_index,) + + # Test gradients + gather_grad = np.random.randn( + *gather.get_shape().as_list()).astype(dtype.as_numpy_dtype) + if dtype.is_complex: + gather_grad -= 1j * gather_grad + params_grad, indices_grad, axis_grad = gradients_impl.gradients( + gather, [tf_params, tf_indices, tf_axis], gather_grad) + self.assertEqual(indices_grad, None) + self.assertEqual(axis_grad, None) + if dtype.is_integer: + self.assertEqual(params_grad, None) + continue + # For axis 0, we are able to create an efficient IndexedSlices for + # the gradient. 
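          # (tf.IndexedSlices stores the gradient values together with the
          # first-dimension rows they belong to, so this sparse form only
          # applies when gathering along the leading axis.)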
+ if axis == 0: + self.assertEqual(type(params_grad), ops.IndexedSlices) + params_grad = ops.convert_to_tensor(params_grad) + correct_params_grad = np.zeros(shape).astype(dtype.as_numpy_dtype) + outer_dims = axis + inner_dims = len(shape) - axis - 1 + gather_grad = gather_grad.reshape( + shape[:axis] + (indices.size,) + shape[axis + 1:]) + for source_index, dest_index in enumerate(indices.flat): + dest_slice = ((slice(None),) * outer_dims + (dest_index,) + (slice(None),) * inner_dims) - correct_params_grad[dest_slice] += gather_grad[source_slice] - self.assertAllClose( - correct_params_grad, - self.evaluate(params_grad), - atol=2e-6, - rtol=2e-6) + source_slice = ((slice(None),) * outer_dims + (source_index,) + + (slice(None),) * inner_dims) + correct_params_grad[dest_slice] += gather_grad[source_slice] + self.assertAllClose( + correct_params_grad, + self.evaluate(params_grad), + atol=2e-6, + rtol=2e-6) @test_util.run_deprecated_v1 def testString(self): @@ -177,12 +182,14 @@ class GatherTest(test.TestCase, parameterized.TestCase): @test_util.run_deprecated_v1 def testUInt32AndUInt64(self): for unsigned_type in (dtypes.uint32, dtypes.uint64): - params = self._buildParams( - np.array([[1, 2, 3], [7, 8, 9]]), unsigned_type) - with self.cached_session(): - self.assertAllEqual([7, 8, 9], - array_ops.gather(params, 1, axis=0).eval()) - self.assertAllEqual([1, 7], array_ops.gather(params, 0, axis=1).eval()) + with self.subTest(unsigned_type=unsigned_type): + params = self._buildParams( + np.array([[1, 2, 3], [7, 8, 9]]), unsigned_type) + with self.cached_session(): + self.assertAllEqual([7, 8, 9], + array_ops.gather(params, 1, axis=0).eval()) + self.assertAllEqual([1, 7], + array_ops.gather(params, 0, axis=1).eval()) @test_util.run_deprecated_v1 def testUnknownIndices(self): @@ -237,14 +244,15 @@ class GatherTest(test.TestCase, parameterized.TestCase): indices = 0 for bad_axis in (1, 2, -2): # Shape inference can validate axis for known params rank. - with self.assertRaisesWithPredicateMatch( - ValueError, "Shape must be at least rank . but is rank 1"): - array_ops.gather(params, indices, axis=bad_axis) - # If params rank is unknown, an op error occurs. - with self.assertRaisesOpError( - r"Expected axis in the range \[-1, 1\), but got %s" % bad_axis): - array_ops.gather(params_ph, indices, axis=bad_axis).eval( - feed_dict={params_ph: params}) + with self.subTest(bad_axis=bad_axis): + with self.assertRaisesWithPredicateMatch( + ValueError, "Shape must be at least rank . but is rank 1"): + array_ops.gather(params, indices, axis=bad_axis) + # If params rank is unknown, an op error occurs. + with self.assertRaisesOpError( + r"Expected axis in the range \[-1, 1\), but got %s" % bad_axis): + array_ops.gather(params_ph, indices, axis=bad_axis).eval( + feed_dict={params_ph: params}) @test_util.run_deprecated_v1 def testEmptySlices(self): @@ -252,20 +260,21 @@ class GatherTest(test.TestCase, parameterized.TestCase): for dtype in _TEST_TYPES: for itype in np.int32, np.int64: # Leading axis gather. - params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype) - indices = np.array([3, 4], dtype=itype) - gather = array_ops.gather(params, indices, axis=0) - self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0))) + with self.subTest(dtype=dtype, itype=itype): + params = np.zeros((7, 0, 0), dtype=dtype.as_numpy_dtype) + indices = np.array([3, 4], dtype=itype) + gather = array_ops.gather(params, indices, axis=0) + self.assertAllEqual(gather.eval(), np.zeros((2, 0, 0))) - # Middle axis gather. 
- params = np.zeros((0, 7, 0), dtype=dtype.as_numpy_dtype) - gather = array_ops.gather(params, indices, axis=1) - self.assertAllEqual(gather.eval(), np.zeros((0, 2, 0))) + # Middle axis gather. + params = np.zeros((0, 7, 0), dtype=dtype.as_numpy_dtype) + gather = array_ops.gather(params, indices, axis=1) + self.assertAllEqual(gather.eval(), np.zeros((0, 2, 0))) - # Trailing axis gather. - params = np.zeros((0, 0, 7), dtype=dtype.as_numpy_dtype) - gather = array_ops.gather(params, indices, axis=2) - self.assertAllEqual(gather.eval(), np.zeros((0, 0, 2))) + # Trailing axis gather. + params = np.zeros((0, 0, 7), dtype=dtype.as_numpy_dtype) + gather = array_ops.gather(params, indices, axis=2) + self.assertAllEqual(gather.eval(), np.zeros((0, 0, 2))) @parameterized.parameters([ # batch_dims=0 (equivalent to tf.gather) diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py index 7d9e875be2d..436fef8171f 100644 --- a/tensorflow/python/kernel_tests/unique_op_test.py +++ b/tensorflow/python/kernel_tests/unique_op_test.py @@ -61,17 +61,18 @@ class UniqueTest(test.TestCase): def testInt32Axis(self): for dtype in [np.int32, np.int64]: - x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) - y0, idx0 = gen_array_ops.unique_v2(x, axis=np.array([0], dtype)) - self.assertEqual(y0.shape.rank, 2) - tf_y0, tf_idx0 = self.evaluate([y0, idx0]) - y1, idx1 = gen_array_ops.unique_v2(x, axis=np.array([1], dtype)) - self.assertEqual(y1.shape.rank, 2) - tf_y1, tf_idx1 = self.evaluate([y1, idx1]) - self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) - self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) - self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) - self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + with self.subTest(dtype=dtype): + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + y0, idx0 = gen_array_ops.unique_v2(x, axis=np.array([0], dtype)) + self.assertEqual(y0.shape.rank, 2) + tf_y0, tf_idx0 = self.evaluate([y0, idx0]) + y1, idx1 = gen_array_ops.unique_v2(x, axis=np.array([1], dtype)) + self.assertEqual(y1.shape.rank, 2) + tf_y1, tf_idx1 = self.evaluate([y1, idx1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) def testInt32V2(self): # This test is only temporary, once V2 is used @@ -144,26 +145,28 @@ class UniqueWithCountsTest(test.TestCase): for i in range(len(x)): self.assertEqual(x[i], tf_y[tf_idx[i]].decode('ascii')) for value, count in zip(tf_y, tf_count): - v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] - self.assertEqual(count, sum(v)) + with self.subTest(value=value, count=count): + v = [1 if x[i] == value.decode('ascii') else 0 for i in range(7000)] + self.assertEqual(count, sum(v)) def testInt32Axis(self): for dtype in [np.int32, np.int64]: - x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) - y0, idx0, count0 = gen_array_ops.unique_with_counts_v2( - x, axis=np.array([0], dtype)) - self.assertEqual(y0.shape.rank, 2) - tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0]) - y1, idx1, count1 = gen_array_ops.unique_with_counts_v2( - x, axis=np.array([1], dtype)) - self.assertEqual(y1.shape.rank, 2) - tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1]) - self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) - self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) - self.assertAllEqual(tf_count0, 
np.array([2, 1])) - self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) - self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) - self.assertAllEqual(tf_count1, np.array([1, 2])) + with self.subTest(dtype=dtype): + x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]]) + y0, idx0, count0 = gen_array_ops.unique_with_counts_v2( + x, axis=np.array([0], dtype)) + self.assertEqual(y0.shape.rank, 2) + tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0]) + y1, idx1, count1 = gen_array_ops.unique_with_counts_v2( + x, axis=np.array([1], dtype)) + self.assertEqual(y1.shape.rank, 2) + tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1]) + self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]])) + self.assertAllEqual(tf_idx0, np.array([0, 0, 1])) + self.assertAllEqual(tf_count0, np.array([2, 1])) + self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]])) + self.assertAllEqual(tf_idx1, np.array([0, 1, 1])) + self.assertAllEqual(tf_count1, np.array([1, 2])) def testInt32V2(self): # This test is only temporary, once V2 is used From ec7ea83d9d416ac5322061535a2251658cbf5d22 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Thu, 14 May 2020 14:19:16 -0700 Subject: [PATCH 216/412] [TF:TRT] Enable concatenation_test and biasadd_matmul_test for TAP. PiperOrigin-RevId: 311604247 Change-Id: Ifca2be4bf2f40dc48f2beffb76bea94fe52101b4 --- tensorflow/python/compiler/tensorrt/BUILD | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/compiler/tensorrt/BUILD b/tensorflow/python/compiler/tensorrt/BUILD index 1e4c215994f..192ba71cebd 100644 --- a/tensorflow/python/compiler/tensorrt/BUILD +++ b/tensorflow/python/compiler/tensorrt/BUILD @@ -120,8 +120,10 @@ cuda_py_tests( srcs = [ "test/base_test.py", "test/batch_matmul_test.py", + "test/biasadd_matmul_test.py", "test/binary_tensor_weight_broadcast_test.py", "test/combined_nms_test.py", + "test/concatenation_test.py", "test/const_broadcast_test.py", "test/conv2d_test.py", "test/dynamic_input_shapes_test.py", @@ -155,27 +157,6 @@ cuda_py_tests( ], ) -cuda_py_tests( - name = "concatenation_test", - srcs = [ - "test/biasadd_matmul_test.py", - "test/concatenation_test.py", - ], - python_version = "PY3", - tags = [ - "no_rocm", - "no_windows", - "nomac", - "notap", # b/140261407 - ], - xla_enable_strict_auto_jit = False, - deps = [ - ":tf_trt_integration_test_base", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_test_lib", - ], -) - cuda_py_test( name = "quantization_mnist_test", srcs = ["test/quantization_mnist_test.py"], From 501309eef9b43a3e965c2ec6a315ea76fffa2c90 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Thu, 14 May 2020 14:22:42 -0700 Subject: [PATCH 217/412] Only add sub_index to _Arg nodes. 
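
Illustrative sketch of the guard this change introduces (identifiers are taken
from the replicate_per_replica_nodes.cc diff below; this is a simplified
excerpt, not the complete helper). The "sub_index" attribute is now attached
only to _Arg nodes, both when per-replica copies are created and when the
original node is reused for a single allowed device:

    // Attach the per-replica "sub_index" attribute only to _Arg nodes.
    if (replicated_node->IsArg()) {
      replicated_node->AddAttr("sub_index", i);
    }
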
PiperOrigin-RevId: 311604877 Change-Id: Ib7c941b38e6ea38378bd4d9d44dc1d262ee6dd4a --- .../core/common_runtime/replicate_per_replica_nodes.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc index cfbcde82ce2..fbae80aef55 100644 --- a/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc +++ b/tensorflow/core/common_runtime/replicate_per_replica_nodes.cc @@ -42,7 +42,9 @@ class ReplicateHelper { Node* replicated_node = graph->AddNode(node_def, &status); TF_RETURN_IF_ERROR(status); replicated_node->set_assigned_device_name(device); - replicated_node->AddAttr("sub_index", i); + if (replicated_node->IsArg()) { + replicated_node->AddAttr("sub_index", i); + } replicated_nodes[i] = replicated_node; } replicated_nodes_map_.emplace(node, std::move(replicated_nodes)); @@ -214,7 +216,9 @@ Status ReplicatePerReplicaNodesInFunctionGraph( // Reuse the original nodes if there is only one allowed device. for (Node* n : cluster_nodes) { n->set_assigned_device_name(allowed_devices.at(0)); - n->AddAttr("sub_index", 0); + if (n->IsArg()) { + n->AddAttr("sub_index", 0); + } } continue; } From 6f57007fb8713c2e2cb3f2aa8971544b67cc2516 Mon Sep 17 00:00:00 2001 From: Andrew Selle Date: Thu, 14 May 2020 14:32:59 -0700 Subject: [PATCH 218/412] Use subTest on einsum_test to make errors easier to understand. PiperOrigin-RevId: 311606884 Change-Id: I7f8738ffc26479f98d468431706c7d4f7c6efcfc --- .../python/kernel_tests/einsum_op_test.py | 72 ++++++++++--------- 1 file changed, 40 insertions(+), 32 deletions(-) diff --git a/tensorflow/python/kernel_tests/einsum_op_test.py b/tensorflow/python/kernel_tests/einsum_op_test.py index a6b623b828c..47d5d457193 100644 --- a/tensorflow/python/kernel_tests/einsum_op_test.py +++ b/tensorflow/python/kernel_tests/einsum_op_test.py @@ -42,10 +42,11 @@ class EinsumOpTest(test.TestCase): r = np.random.RandomState(0) inputs = [] for shape in input_shapes: - arr = np.array(r.randn(*shape)).astype(dtype) - if dtype == np.complex64 or dtype == np.complex128: - arr += 1j * np.array(r.randn(*shape)).astype(dtype) - inputs.append(arr) + with self.subTest(s=s, shape=shape): + arr = np.array(r.randn(*shape)).astype(dtype) + if dtype == np.complex64 or dtype == np.complex128: + arr += 1j * np.array(r.randn(*shape)).astype(dtype) + inputs.append(arr) input_tensors = [constant_op.constant(x, shape=x.shape) for x in inputs] a = np.einsum(s, *inputs) b = self.evaluate(gen_linalg_ops.einsum(input_tensors, s)) @@ -160,10 +161,11 @@ class EinsumOpTest(test.TestCase): input_shapes = [(2, 2), (2, 2)] inputs = [] for shape in input_shapes: - arr = np.array(r.randn(*shape)).astype(dtype) - if dtype == np.complex64 or dtype == np.complex128: - arr += 1j * np.array(r.randn(*shape)).astype(dtype) - inputs.append(arr) + with self.subTest(dtype=dtype, shape=shape): + arr = np.array(r.randn(*shape)).astype(dtype) + if dtype == np.complex64 or dtype == np.complex128: + arr += 1j * np.array(r.randn(*shape)).astype(dtype) + inputs.append(arr) input_tensors = [constant_op.constant(x) for x in inputs] if dtype == bfloat16: # np.einsum doesn't support bfloat16. 
@@ -199,14 +201,15 @@ class EinsumOpTest(test.TestCase): ('...ij,...jk->ik', r.randn(2, 2, 3), r.randn(3, 4)), ] for args in cases: - with self.assertRaises((ValueError, errors.InvalidArgumentError)): - _ = self.evaluate(gen_linalg_ops.einsum(args[1:], args[0])) + with self.subTest(args=args): + with self.assertRaises((ValueError, errors.InvalidArgumentError)): + _ = self.evaluate(gen_linalg_ops.einsum(args[1:], args[0])) - placeholders = [ - array_ops.placeholder_with_default(x, shape=None) for x in args[1:] - ] - with self.assertRaises((ValueError, errors.InvalidArgumentError)): - _ = self.evaluate(gen_linalg_ops.einsum(placeholders, args[0])) + placeholders = [ + array_ops.placeholder_with_default(x, shape=None) for x in args[1:] + ] + with self.assertRaises((ValueError, errors.InvalidArgumentError)): + _ = self.evaluate(gen_linalg_ops.einsum(placeholders, args[0])) @test_util.run_in_graph_and_eager_modes def testPlaceholder(self): @@ -216,10 +219,12 @@ class EinsumOpTest(test.TestCase): inputs = [] input_placeholders = [] for actual_shape, placeholder_shape in input_and_placeholder_shapes: - input_np = np.array(r.randn(*actual_shape)) - inputs.append(input_np) - input_placeholders.append( - array_ops.placeholder_with_default(input_np, placeholder_shape)) + with self.subTest(equation=equation, actual_shape=actual_shape, + placeholder_shape=placeholder_shape): + input_np = np.array(r.randn(*actual_shape)) + inputs.append(input_np) + input_placeholders.append( + array_ops.placeholder_with_default(input_np, placeholder_shape)) a = np.einsum(equation, *inputs) b = self.evaluate(gen_linalg_ops.einsum(input_placeholders, equation)) @@ -288,19 +293,22 @@ class EinsumGradTest(test.TestCase): with self.cached_session(): r = np.random.RandomState(seed=0) for dtype in (np.float32, np.float64, np.complex64, np.complex128): - tol = 10 * np.sqrt(np.finfo(dtype).resolution) - if dtype in (np.complex64, np.complex128): - inputs = [ - np.array(r.randn(*shape), dtype) + - 1j * np.array(r.randn(*shape), dtype) for shape in input_shapes - ] - else: - inputs = [np.array(r.randn(*shape), dtype) for shape in input_shapes] - input_tensors = [constant_op.constant(x, shape=x.shape) for x in inputs] - analytical, numerical = gradient_checker_v2.compute_gradient( - lambda *xs: gen_linalg_ops.einsum(xs, s), input_tensors) - self.assertLess( - gradient_checker_v2.max_error(analytical, numerical), tol) + with self.subTest(s=s, dtype=dtype): + tol = 10 * np.sqrt(np.finfo(dtype).resolution) + if dtype in (np.complex64, np.complex128): + inputs = [ + np.array(r.randn(*shape), dtype) + + 1j * np.array(r.randn(*shape), dtype) for shape in input_shapes + ] + else: + inputs = [ + np.array(r.randn(*shape), dtype) for shape in input_shapes] + input_tensors = [ + constant_op.constant(x, shape=x.shape) for x in inputs] + analytical, numerical = gradient_checker_v2.compute_gradient( + lambda *xs: gen_linalg_ops.einsum(xs, s), input_tensors) + self.assertLess( + gradient_checker_v2.max_error(analytical, numerical), tol) @test_util.disable_xla('b/131919749') def testUnary(self): From eab1e71ebfd3e5ef83219584dd3785db46200d43 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Thu, 14 May 2020 14:44:10 -0700 Subject: [PATCH 219/412] Add Minimum and Maximum ops to Hexagon delegate for uint8/int8 PiperOrigin-RevId: 311609003 Change-Id: Iedb6b59b5895b28c906c029e95202294377d32ec --- .../experimental/delegates/hexagon/README.md | 2 + .../delegates/hexagon/builders/BUILD | 2 + .../hexagon/builders/min_max_builder.cc | 106 +++++++++++ 
.../hexagon/builders/min_max_builder.h | 45 +++++ .../delegates/hexagon/builders/op_builder.cc | 4 + .../delegates/hexagon/builders/op_factory.h | 1 + .../delegates/hexagon/builders/tests/BUILD | 1 + .../builders/tests/min_max_builder_test.cc | 171 ++++++++++++++++++ .../experimental/delegates/hexagon/utils.cc | 12 ++ 9 files changed, 344 insertions(+) create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h create mode 100644 tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc diff --git a/tensorflow/lite/experimental/delegates/hexagon/README.md b/tensorflow/lite/experimental/delegates/hexagon/README.md index a97342c9fdc..6e627c17cd2 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/README.md +++ b/tensorflow/lite/experimental/delegates/hexagon/README.md @@ -79,8 +79,10 @@ are verified in `IsNodeSupportedByHexagon`: * Hardswish * L2Normalization (without any activation) * Logistic (aka Sigmoid) +* Maximum * MaxPool2D (without any activation) (b/129276536) * Mean +* Minimum * MirrorPad * Mul (without any activation) (b/129276536) * Neg diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD index ff764984de9..e24adc2537c 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/BUILD @@ -19,6 +19,7 @@ cc_library( "hardswish_builder.cc", "l2_normalization_builder.cc", "matmul_builder.cc", + "min_max_builder.cc", "mirror_pad_builder.cc", "neg_op_builder.cc", "op_builder.cc", @@ -46,6 +47,7 @@ cc_library( "hardswish_builder.h", "l2_normalization_builder.h", "matmul_builder.h", + "min_max_builder.h", "mirror_pad_builder.h", "neg_op_builder.h", "op_builder.h", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc new file mode 100644 index 00000000000..ab5895b9a14 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.cc @@ -0,0 +1,106 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h" + +#include "tensorflow/lite/c/common.h" + +namespace tflite { +namespace delegates { +namespace hexagon { +TfLiteStatus MinMaxOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) { + static int scalar_shape[] = {1, 1, 1, 1}; + int a_tensor_id; + int b_tensor_id; + + // Input tensors a and b. 
+ a_tensor_id = inputs->data[0]; + b_tensor_id = inputs->data[1]; + const auto& a_tensor = context->tensors[a_tensor_id]; + const auto& b_tensor = context->tensors[b_tensor_id]; + if (a_tensor.allocation_type == kTfLiteMmapRo) + graph_builder_->AddConstNodeWithData(a_tensor_id, a_tensor); + if (b_tensor.allocation_type == kTfLiteMmapRo) + graph_builder_->AddConstNodeWithData(b_tensor_id, b_tensor); + AddInput(graph_builder_->GetHexagonTensorId(a_tensor_id)); + AddInput(graph_builder_->GetHexagonTensorId(b_tensor_id)); + + // Add Inputs A & B min/max + TF_LITE_ENSURE_STATUS( + ComputeMinAndMaxQuantValues(a_tensor, &a_input_min_, &a_input_max_)); + auto* a_input_min_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&a_input_min_), + sizeof(a_input_min_)); + auto* a_input_max_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&a_input_max_), + sizeof(a_input_max_)); + AddInput(TensorID(a_input_min_const->GetID(), 0)); + AddInput(TensorID(a_input_max_const->GetID(), 0)); + + TF_LITE_ENSURE_STATUS( + ComputeMinAndMaxQuantValues(b_tensor, &b_input_min_, &b_input_max_)); + auto* b_input_min_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&b_input_min_), + sizeof(b_input_min_)); + auto* b_input_max_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&b_input_max_), + sizeof(b_input_max_)); + AddInput(TensorID(b_input_min_const->GetID(), 0)); + AddInput(TensorID(b_input_max_const->GetID(), 0)); + + // Add output min/max + const int output_tensor_id = outputs->data[0]; + const auto& output_tensor = context->tensors[output_tensor_id]; + float output_min, output_max; + TF_LITE_ENSURE_STATUS( + ComputeMinAndMaxQuantValues(output_tensor, &output_min, &output_max)); + auto* output_min_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&output_min), sizeof(output_min)); + auto* output_max_const = graph_builder_->AddConstNodeWithData( + scalar_shape, reinterpret_cast(&output_max), sizeof(output_max)); + AddInput(TensorID(output_min_const->GetID(), 0)); + AddInput(TensorID(output_max_const->GetID(), 0)); + + // Add outputs. + int output_batch_size, output_height_size, output_width_size, + output_depth_size; + GetDims(&output_batch_size, &output_height_size, &output_width_size, + &output_depth_size, context->tensors[outputs->data[0]].dims); + node_output_ = AddOutput(sizeof(uint8_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + + return kTfLiteOk; +} + +TfLiteStatus MinMaxOpBuilder::RegisterOutputs(const TfLiteIntArray* outputs, + TfLiteContext* context) { + // Should be only 1 output. + graph_builder_->AddTensorWithID(outputs->data[0], node_output_.first, + node_output_.second); + + return kTfLiteOk; +} + +OpBuilder* CreateMinMaxBuilder(GraphBuilder* graph_builder, int op_type) { + return new MinMaxOpBuilder(graph_builder, op_type); +} + +} // namespace hexagon +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h new file mode 100644 index 00000000000..4d50d941e4f --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/min_max_builder.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIN_MAX_BUILDER_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIN_MAX_BUILDER_H_ + +#include "tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.h" + +namespace tflite { +namespace delegates { +namespace hexagon { + +class MinMaxOpBuilder : public OpBuilder { + public: + explicit MinMaxOpBuilder(GraphBuilder* graph_builder, int op_type) + : OpBuilder(graph_builder, op_type) {} + + TfLiteStatus PopulateSubGraph(const TfLiteIntArray* inputs, + const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + TfLiteStatus RegisterOutputs(const TfLiteIntArray* outputs, + TfLiteContext* context) override; + + private: + TensorID node_output_; + float a_input_min_, a_input_max_, b_input_min_, b_input_max_; +}; + +} // namespace hexagon +} // namespace delegates +} // namespace tflite + +#endif // TENSORFLOW_LITE_EXPERIMENTAL_DELEGATES_HEXAGON_BUILDERS_MIN_MAX_BUILDER_H_ diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc index c7432e64c79..230a292b6fe 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_builder.cc @@ -93,6 +93,10 @@ OpBuilder* GraphBuilder::CreateOpBuilderFromTfLiteOp(int op_type) { return CreateQuantizeBuilder(this, OP_Requantize_8to8); case kTfLiteBuiltinHardSwish: return CreateHardSwishBuilder(this, OP_QuantizedHardSwish_8); + case kTfLiteBuiltinMinimum: + return CreateMinMaxBuilder(this, OP_QuantizedMinimum_8); + case kTfLiteBuiltinMaximum: + return CreateMinMaxBuilder(this, OP_QuantizedMaximum_8); default: context_->ReportError(context_, "Op not supported: %d", op_type); return nullptr; diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h index 0beb88cc68e..515d0edb929 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/op_factory.h @@ -54,6 +54,7 @@ OpBuilder* CreateBatchSeqBuilder(GraphBuilder* graph_builder, int op_type, OpBuilder* CreateQuantizeBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateHardSwishBuilder(GraphBuilder* graph_builder, int op_type); OpBuilder* CreateCastBuilder(GraphBuilder* graph_builder, int op_type); +OpBuilder* CreateMinMaxBuilder(GraphBuilder* graph_builder, int op_type); } // namespace hexagon } // namespace delegates diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD index 47a78dca6ac..a5cdc0411ca 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD +++ 
b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/BUILD @@ -30,6 +30,7 @@ hexagon_op_tests( "conv_test.cc", "l2_norm_test.cc", "matmul_test.cc", + "min_max_builder_test.cc", "mirror_pad_test.cc", "mul_test.cc", "neg_test.cc", diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc new file mode 100644 index 00000000000..315ea909c53 --- /dev/null +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/min_max_builder_test.cc @@ -0,0 +1,171 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include "tensorflow/lite/experimental/delegates/hexagon/builders/tests/hexagon_delegate_op_model.h" + +namespace tflite { +using testing::ElementsAreArray; + +template +class MinMaxOpModel : public SingleOpModelWithHexagon { + public: + MinMaxOpModel(tflite::BuiltinOperator op, const TensorData& input1, + const TensorData& input2, const TensorData& output) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions, + CreateMaximumMinimumOptions(builder_).Union()); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + } + + MinMaxOpModel(tflite::BuiltinOperator op, const TensorData& input1, + std::initializer_list input1_values, + const TensorData& input2, + std::initializer_list input2_values, + const TensorData& output, bool input1_const) { + input1_ = AddInput(input1); + input2_ = AddInput(input2); + output_ = AddOutput(output); + SetBuiltinOp(op, BuiltinOptions_MaximumMinimumOptions, + CreateMaximumMinimumOptions(builder_).Union()); + BuildInterpreter({GetShape(input1_), GetShape(input2_)}); + + // A workaround to mark the tensors as constant. 
+ if (input1_const) { + auto* input1_tensor = interpreter_->tensor(input1_); + input1_tensor->allocation_type = kTfLiteMmapRo; + } else { + auto* input2_tensor = interpreter_->tensor(input2_); + input2_tensor->allocation_type = kTfLiteMmapRo; + } + } + + void SetInput1(std::vector data) { PopulateTensor(input1_, data); } + + void SetInput2(std::vector data) { PopulateTensor(input2_, data); } + + std::vector GetOutput() { + return ExtractVector(output_); + } + + template + std::vector GetDequantizedOutput() { + return Dequantize(ExtractVector(output_), GetScale(output_), + GetZeroPoint(output_)); + } + + std::vector GetOutputShape() { return GetTensorShape(output_); } + + protected: + int input1_; + int input2_; + int output_; +}; + +template +void TestModel(tflite::BuiltinOperator op, const TensorData& input1, + const TensorData& input2, const TensorData& output, + std::initializer_list input1_values, + std::initializer_list input2_values) { + std::unique_ptr> m; + m = std::make_unique>(op, input1, input2, output); + m->SetInput1(input1_values); + m->SetInput2(input2_values); + + m->Invoke(); + const auto reference_output = m->GetOutput(); + const auto reference_output_shape = m->GetOutputShape(); + m->ApplyDelegateAndInvoke(); + EXPECT_THAT(m->GetOutputShape(), ElementsAreArray(reference_output_shape)); + EXPECT_THAT(m->GetOutput(), ElementsAreArray(reference_output)); +} + +template +void TestModelConstInput(tflite::BuiltinOperator op, const TensorData& input1, + const TensorData& input2, const TensorData& output, + std::initializer_list input1_values, + std::initializer_list input2_values, + bool input1_const) { + std::unique_ptr> m; + m = std::make_unique>( + op, input1, input1_values, input2, input2_values, output, input1_const); + m->SetInput1(input1_values); + m->SetInput2(input2_values); + + m->Invoke(); + const auto reference_output = m->GetOutput(); + const auto reference_output_shape = m->GetOutputShape(); + m->ApplyDelegateAndInvoke(); + EXPECT_THAT(m->GetOutputShape(), ElementsAreArray(reference_output_shape)); + EXPECT_THAT(m->GetOutput(), ElementsAreArray(reference_output)); +} + +TEST(MinMaxOpTest, Maximum_Uint8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 255, 1}; + TestModel(BuiltinOperator_MAXIMUM, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, data1, data2); +} + +TEST(MinMaxOpTest, Maximum_Uint8Test_Const) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 255, 1}; + TestModelConstInput( + BuiltinOperator_MAXIMUM, {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, data1, data2, false); +} + +TEST(MinMaxOpTest, Minimum_Uint8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 255, 1}; + TestModel(BuiltinOperator_MINIMUM, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 255}, data1, data2); +} + +TEST(MinMaxOpTest, Minimum_Uint8Test_Const) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 20, 1}; + TestModelConstInput( + BuiltinOperator_MINIMUM, {TensorType_UINT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_UINT8, {1, 3, 1, 2}, -1, 25}, data1, data2, false); +} + 
+TEST(MinMaxOpTest, Maximum_Int8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 123, 1}; + TestModel(BuiltinOperator_MAXIMUM, + {TensorType_INT8, {1, 3, 1, 2}, -1, 125}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 125}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 125}, data1, data2); +} + +TEST(MinMaxOpTest, Minimum_Int8Test) { + std::initializer_list data1 = {1, 0, 2, 11, 2, 23}; + std::initializer_list data2 = {0, 0, 1, 12, 12, 1}; + TestModel(BuiltinOperator_MINIMUM, + {TensorType_INT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 25}, + {TensorType_INT8, {1, 3, 1, 2}, -1, 25}, data1, data2); +} + +} // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index 1df0a6df66c..8aff13549b8 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -79,8 +79,10 @@ bool CheckOpVersion(const TfLiteRegistration* registration) { case kTfLiteBuiltinConcatenation: case kTfLiteBuiltinL2Normalization: case kTfLiteBuiltinLogistic: + case kTfLiteBuiltinMaximum: case kTfLiteBuiltinMaxPool2d: case kTfLiteBuiltinMean: + case kTfLiteBuiltinMinimum: case kTfLiteBuiltinMirrorPad: case kTfLiteBuiltinMul: case kTfLiteBuiltinPad: @@ -366,6 +368,16 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return InputsWithCorrectTypes(node, context, {{kTfLiteUInt8, kTfLiteInt8}}); } + case kTfLiteBuiltinMinimum: { + return InputsWithCorrectTypes( + node, context, + {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}}); + } + case kTfLiteBuiltinMaximum: { + return InputsWithCorrectTypes( + node, context, + {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}}); + } default: return false; } From b4cf239a2f1fb9f5979eed8d543fb8a754470339 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Thu, 14 May 2020 14:59:21 -0700 Subject: [PATCH 220/412] Reinstall tf-estimator-nightly since regular installs might not work. 
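
For illustration, the install step the release scripts below switch to looks
roughly like this (flags taken from the common.sh change; the exact pip
command name and additional flags vary per script):

    pip install --force-reinstall --user --upgrade tf-estimator-nightly
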
PiperOrigin-RevId: 311611762 Change-Id: Ib72c156ab89422bbfce47be8216535c094b4cd6b --- tensorflow/tools/ci_build/release/common.sh | 6 ++++-- tensorflow/tools/ci_build/release/common_win.bat | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/release/common.sh b/tensorflow/tools/ci_build/release/common.sh index bb40042e3af..0a9f6eae0b3 100644 --- a/tensorflow/tools/ci_build/release/common.sh +++ b/tensorflow/tools/ci_build/release/common.sh @@ -177,7 +177,8 @@ function install_ubuntu_16_pip_deps { "${PIP_CMD}" install scipy --user "${PIP_CMD}" install scikit-learn --user "${PIP_CMD}" install PyYAML==3.13 --user - "${PIP_CMD}" install --user --upgrade tf-estimator-nightly + # b/156523241 + "${PIP_CMD}" install --force-reinstall --user --upgrade tf-estimator-nightly "${PIP_CMD}" install --user --upgrade tb-nightly "${PIP_CMD}" install --user --upgrade wrapt # LINT.ThenChange(:ubuntu_pip_installations) @@ -220,7 +221,8 @@ function install_macos_pip_deps { ${SUDO_CMD} ${PIP_CMD} install --upgrade grpcio ${SUDO_CMD} ${PIP_CMD} install --upgrade tb-nightly ${PIP_CMD} install --user --upgrade attrs - ${PIP_CMD} install --user --upgrade tf-estimator-nightly + # b/156523241 + ${PIP_CMD} install --force-reinstall --user --upgrade tf-estimator-nightly ${PIP_CMD} install --user --upgrade wrapt ${PIP_CMD} install --user --upgrade "future>=0.17.1" } diff --git a/tensorflow/tools/ci_build/release/common_win.bat b/tensorflow/tools/ci_build/release/common_win.bat index 85f22c1e4cb..d34c92736c0 100644 --- a/tensorflow/tools/ci_build/release/common_win.bat +++ b/tensorflow/tools/ci_build/release/common_win.bat @@ -28,7 +28,7 @@ SET PATH=%PATH%;C:\%PYTHON_DIRECTORY% %PIP_EXE% install setuptools --upgrade %PIP_EXE% install future>=0.17.1 --no-deps -%PIP_EXE% install tf-estimator-nightly --no-deps +%PIP_EXE% install --force-reinstall tf-estimator-nightly --no-deps %PIP_EXE% install tb-nightly --no-deps %PIP_EXE% install numpy --upgrade --no-deps %PIP_EXE% install opt_einsum --upgrade From 6abea04db74ef7eede4e3dbd91282c77df866d23 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 15:51:15 -0700 Subject: [PATCH 221/412] Added BUILD rules for the micro-frontend TF op so that we can use it as a C++ library. PiperOrigin-RevId: 311621054 Change-Id: I54e9932fe54f7cc94f5863f7924d85853d24e48e --- tensorflow/lite/experimental/microfrontend/BUILD | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tensorflow/lite/experimental/microfrontend/BUILD b/tensorflow/lite/experimental/microfrontend/BUILD index aaaf864bb60..bf0eb6ae726 100644 --- a/tensorflow/lite/experimental/microfrontend/BUILD +++ b/tensorflow/lite/experimental/microfrontend/BUILD @@ -27,6 +27,17 @@ cc_library( ], ) +cc_library( + name = "audio_microfrontend_op_lib", + srcs = ["ops/audio_microfrontend_op.cc"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/lite/experimental/microfrontend/lib:frontend", + ], + alwayslink = 1, +) + cc_test( name = "audio_microfrontend_test", size = "small", From c628246c31ea9ff1d96ffc59a12f748db418ea76 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 16:14:53 -0700 Subject: [PATCH 222/412] Go: Update generated wrapper functions for TensorFlow ops. 
PiperOrigin-RevId: 311625551 Change-Id: I3205d380573ed326d5b55cdc089577f34433f1f1 --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index e6725269279..a6ee1a13b6e 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. 
The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 7d40f2c3897a8c7bb8d236c352fcd267fbe9bc88 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Thu, 14 May 2020 16:21:53 -0700 Subject: [PATCH 223/412] Fix bazel TFLM compilation w/ TF_LITE_STATIC_MEMORY Ensure dynamic string utils aren't compiled when this build define is present. PiperOrigin-RevId: 311626904 Change-Id: Ica229bf337019f0f446fdb94aaf42c6b7e7c749e --- tensorflow/lite/kernels/BUILD | 2 +- tensorflow/lite/kernels/internal/BUILD | 3 +-- .../internal/reference/portable_tensor_utils.cc | 12 +++++++----- .../internal/reference/portable_tensor_utils.h | 1 - .../internal/reference/portable_tensor_utils_impl.h | 6 +++++- tensorflow/lite/kernels/internal/tensor.h | 3 +++ tensorflow/lite/kernels/internal/tensor_utils.h | 7 ++++++- .../lite/kernels/internal/tensor_utils_test.cc | 1 + tensorflow/lite/kernels/non_max_suppression.cc | 1 - tensorflow/lite/string_util.cc | 2 ++ tensorflow/lite/string_util.h | 4 ++++ 11 files changed, 30 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 6f6d111fd77..3a29fee5699 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -386,7 +386,7 @@ cc_library( "//tensorflow/lite/c:common", "//tensorflow/lite/kernels/internal:cppmath", "//tensorflow/lite/kernels/internal:quantization_util", - "@flatbuffers", + "@flatbuffers//:runtime_cc", ], ) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 93292fbb640..d6a96efdbf7 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -629,7 +629,6 @@ cc_library( ":cppmath", "//tensorflow/lite:minimal_logging", "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels:cpu_backend_context", "@gemmlowp", ], ) @@ -785,7 +784,6 @@ cc_library( deps = [ ":cpu_check", "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels:cpu_backend_context", "//third_party/eigen3", ], ) @@ -819,6 +817,7 @@ cc_test( ":quantization_util", ":tensor_utils", "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels:cpu_backend_context", "//tensorflow/lite/kernels:test_util", "@com_google_googletest//:gtest_main", ], diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 22e37d5af71..0e66dfee191 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include "fixedpoint/fixedpoint.h" #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/internal/cppmath.h" @@ -53,7 +52,7 @@ void PortableSymmetricQuantizeFloats(const float* values, const int size, void PortableSymmetricQuantizeFloats(const float* values, const int size, int8_t* quantized_values, float min_value, float max_value, float* scaling_factor) { - const int kScale = 127; + const int32_t kScale = 127; const float range = std::max(std::abs(min_value), std::abs(max_value)); if (range == 0) { memset(quantized_values, 0, size * sizeof(int8_t)); @@ -66,7 +65,8 @@ void PortableSymmetricQuantizeFloats(const float* values, const int size, const int32_t quantized_value = static_cast(TfLiteRound(values[i] * scaling_factor_inv)); // Clamp: just in case some odd numeric offset. - quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value)); + quantized_values[i] = static_cast( + std::min(kScale, std::max(-kScale, quantized_value))); } } @@ -660,7 +660,8 @@ void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, int32_t value = static_cast(a) * static_cast(b); value = MultiplyByQuantizedMultiplier(value, multiplier, shift); value -= output_zp; - value = std::min(std::max(-128, value), 127); + value = std::min(std::max(static_cast(-128), value), + static_cast(127)); output[index] = static_cast(value); } @@ -748,7 +749,8 @@ void PortableVectorBatchVectorCwiseProductAccumulate( int32_t prod = vector[v] * *batch_vector++; prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift); int32_t output = prod + *result; - output = std::max(std::min(32767, output), -32768); + output = std::max(std::min(static_cast(32767), output), + static_cast(-32768)); *result++ = output; } } diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index 9a365074513..f2e6c9b4f7d 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -18,7 +18,6 @@ limitations under the License. // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h" #if defined(_MSC_VER) diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index d8bd70f3722..6c15a6cd919 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -20,13 +20,17 @@ limitations under the License. // TODO(ghodrat): Remove this header file and the dependency to internal data // structure. #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #if defined(_MSC_VER) #define __restrict__ __restrict #endif namespace tflite { + +// Not all backends support CpuBackendContext usage, so forward declare to avoid +// pulling in its implementation. +class CpuBackendContext; + namespace tensor_utils { // Limit a float input f between +abs_limit and -abs_limit. 
diff --git a/tensorflow/lite/kernels/internal/tensor.h b/tensorflow/lite/kernels/internal/tensor.h index 0005bf38d54..543117df0e5 100644 --- a/tensorflow/lite/kernels/internal/tensor.h +++ b/tensorflow/lite/kernels/internal/tensor.h @@ -119,6 +119,8 @@ class SequentialTensorWriter { T* output_ptr_; }; +// String ops are not yet supported on platforms w/ static memory. +#ifndef TF_LITE_STATIC_MEMORY template <> class SequentialTensorWriter { public: @@ -138,6 +140,7 @@ class SequentialTensorWriter { TfLiteTensor* output_; DynamicBuffer buffer_; }; +#endif // TF_LITE_STATIC_MEMORY } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h index 1929c2e2ff4..5e106eb7de4 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/lite/kernels/internal/tensor_utils.h @@ -20,13 +20,18 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/c/builtin_op_data.h" -#include "tensorflow/lite/kernels/cpu_backend_context.h" #if defined(_MSC_VER) #define __restrict__ __restrict #endif namespace tflite { + +// Not all backends support CpuBackendContext usage, so forward declare to avoid +// pulling in its implementation. Use of CpuBackendContext in method +// implementations is purely optional. +class CpuBackendContext; + namespace tensor_utils { // Checks if all entries of vector are zero for float. diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index 9b047d3ba84..3ad59acdb68 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include "tensorflow/lite/c/builtin_op_data.h" +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/common.h" #include "tensorflow/lite/kernels/internal/quantization_util.h" #include "tensorflow/lite/kernels/test_util.h" diff --git a/tensorflow/lite/kernels/non_max_suppression.cc b/tensorflow/lite/kernels/non_max_suppression.cc index ee8e407066d..f57ee1bc5d2 100644 --- a/tensorflow/lite/kernels/non_max_suppression.cc +++ b/tensorflow/lite/kernels/non_max_suppression.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/internal/tensor.h" diff --git a/tensorflow/lite/string_util.cc b/tensorflow/lite/string_util.cc index f7fcf2ac630..44719858f2a 100644 --- a/tensorflow/lite/string_util.cc +++ b/tensorflow/lite/string_util.cc @@ -89,6 +89,7 @@ int DynamicBuffer::WriteToBuffer(char** buffer) { return bytes; } +#ifndef TF_LITE_STATIC_MEMORY void DynamicBuffer::WriteToTensorAsVector(TfLiteTensor* tensor) { auto dims = TfLiteIntArrayCreate(1); dims->data[0] = offset_.size() - 1; // Store number of strings. @@ -109,6 +110,7 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor, tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation, tensor->is_variable, tensor); } +#endif // TF_LITE_STATIC_MEMORY int GetStringCount(const void* raw_buffer) { // The first integers in the raw buffer is the number of strings. 
diff --git a/tensorflow/lite/string_util.h b/tensorflow/lite/string_util.h index 779b1e12ab8..879aa76b83b 100644 --- a/tensorflow/lite/string_util.h +++ b/tensorflow/lite/string_util.h @@ -74,6 +74,9 @@ class DynamicBuffer { // The function allocates space for the buffer but does NOT take ownership. int WriteToBuffer(char** buffer); + // String tensors are not generally supported on platforms w/ static memory. + // TODO(b/156130024): Remove this guard after removing header from TFLM deps. +#ifndef TF_LITE_STATIC_MEMORY // Fill content into a string tensor, with the given new_shape. The new shape // must match the number of strings in this object. Caller relinquishes // ownership of new_shape. If 'new_shape' is nullptr, keep the tensor's @@ -82,6 +85,7 @@ class DynamicBuffer { // Fill content into a string tensor. Set shape to {num_strings}. void WriteToTensorAsVector(TfLiteTensor* tensor); +#endif // TF_LITE_STATIC_MEMORY private: // Data buffer to store contents of strings, not including headers. From e6c2a5a212752e3b6a58a621a4ba512bbb9eb246 Mon Sep 17 00:00:00 2001 From: Anna R Date: Thu, 14 May 2020 16:53:17 -0700 Subject: [PATCH 224/412] Change more libraries in third_party/tensorflow/c/BUILD to depend on portable_tensorflow_lib_lite_no_runtime to support effort to reduce dependencies on mobile. PiperOrigin-RevId: 311632630 Change-Id: I5061b458f894bccb9c0e23791d265f6ee95bdd38 --- tensorflow/c/BUILD | 48 ++++++++++++++++++++-------------- tensorflow/core/platform/BUILD | 2 +- 2 files changed, 29 insertions(+), 21 deletions(-) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 7fb02028837..05d5f9a3ed2 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -219,7 +219,7 @@ tf_cuda_library( ], deps = select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ "//tensorflow/core:lib", @@ -232,12 +232,13 @@ cc_library( srcs = ["tf_status.cc"], hdrs = ["tf_status.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tf_status_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tf_status_internal", "//tensorflow/core:lib", ], }), @@ -259,10 +260,15 @@ cc_library( name = "tensor_interface", hdrs = ["tensor_interface.h"], visibility = ["//tensorflow:internal"], - deps = [ - "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", - ], + deps = select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + }), ) cc_library( @@ -286,16 +292,17 @@ cc_library( srcs = ["tf_tensor.cc"], hdrs = ["tf_tensor.h"], visibility = ["//visibility:public"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ":tf_status_helper", + ":tf_tensor_internal", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", - ":tf_datatype", - ":tf_status", - ":tf_status_helper", - ":tf_tensor_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ 
-311,14 +318,15 @@ tf_cuda_library( "tf_tensor_internal.h", ], visibility = ["//tensorflow:internal"], - deps = select({ + deps = [ + ":tensor_interface", + ":tf_datatype", + ":tf_status", + ] + select({ "//tensorflow:android": [ - "//tensorflow/core:portable_tensorflow_lib_lite", + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs ], "//conditions:default": [ - ":tensor_interface", - ":tf_datatype", - ":tf_status", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:casts", diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index c7ff378d2ac..f78b738247d 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -1472,6 +1472,7 @@ filegroup( "abi.h", "blocking_counter.h", "byte_order.h", + "casts.h", "coding.cc", "coding.h", "context.h", @@ -1557,7 +1558,6 @@ filegroup( srcs = [ "base64.cc", "base64.h", - "casts.h", "cpu_feature_guard.cc", "cpu_feature_guard.h", "fingerprint.h", From 90077f8c7c6517d4d761e35ade80597aab458873 Mon Sep 17 00:00:00 2001 From: Lucy Fox Date: Thu, 14 May 2020 16:59:59 -0700 Subject: [PATCH 225/412] Instrument the number of times the MLIR-based TF Bridge is enabled. PiperOrigin-RevId: 311633792 Change-Id: Iba286e1c82900833b5cf9f69a697a312e51f3156 --- tensorflow/compiler/tf2xla/BUILD | 6 +----- tensorflow/compiler/tf2xla/mlir_bridge_pass.cc | 12 ++++++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 897528b6de9..55341c0a01f 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -704,12 +704,8 @@ cc_library( deps = [ "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:convert_graphdef", - "//tensorflow/compiler/mlir/tensorflow:device_util", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:core_cpu", - "@com_google_absl//absl/container:flat_hash_set", + "//tensorflow/core:lib", "@llvm-project//llvm:support", ], alwayslink = 1, diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 499e27f0981..c398e5f129e 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -18,10 +18,18 @@ limitations under the License. #include #include "tensorflow/compiler/mlir/tensorflow/transforms/bridge.h" +#include "tensorflow/core/lib/monitoring/gauge.h" #include "tensorflow/core/public/session_options.h" namespace tensorflow { +auto* mlir_bridge_gauge_v1 = monitoring::Gauge::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v1", + "Tracks usage of the MLIR-based TF2XLA bridge among TF1 models"); +auto* mlir_bridge_gauge_v2 = monitoring::Gauge::New( + "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v2", + "Tracks usage of the MLIR-based TF2XLA bridge among TF2 models"); + // This runs the first phase of the "bridge", transforming the graph in a form // that can be executed with delegation of some computations to an accelerator. 
// This builds on the model of XLA where a subset of the graph is encapsulated @@ -32,10 +40,12 @@ Status MlirBridgePass::Run(const ConfigProto& config_proto, mlir::ModuleOp module) { if (!config_proto.experimental().enable_mlir_bridge()) { VLOG(0) << "Skipping MLIR TPU Bridge, session flag not enabled"; + mlir_bridge_gauge_v2->GetCell()->Set(false); return Status::OK(); } VLOG(0) << "Running MLIR TPU Bridge"; + mlir_bridge_gauge_v2->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridge(module, /*enable_logging=*/VLOG_IS_ON(1))); @@ -48,10 +58,12 @@ Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, if (!options.session_options->config.experimental().enable_mlir_bridge()) { VLOG(0) << "Skipping MLIR TPU Bridge V1 Compat, session flag not enabled"; + mlir_bridge_gauge_v1->GetCell()->Set(false); return Status::OK(); } VLOG(0) << "Running MLIR TPU Bridge V1 Compat"; + mlir_bridge_gauge_v1->GetCell()->Set(true); TF_RETURN_IF_ERROR( mlir::TFTPU::TPUBridgeV1Compat(module, /*enable_logging=*/VLOG_IS_ON(1))); From d5e0f468cd1a9ddb1de1eaeb62734dc177047c72 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Thu, 14 May 2020 17:03:36 -0700 Subject: [PATCH 226/412] Report remote target in error messages for gRPC eager service requests. PiperOrigin-RevId: 311634462 Change-Id: Ib0550c172e419ea17dac9ffa28c18b9e1a03b3cc --- .../rpc/eager/grpc_eager_client.cc | 13 +++++++----- .../rpc/grpc_rpc_factory.cc | 3 ++- .../core/distributed_runtime/rpc/grpc_state.h | 21 ++++++++++++------- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index de4f36ea24d..752bfdf71a1 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -106,8 +106,8 @@ class GrpcEagerClientThread : public core::RefCounted { class GrpcEagerClient : public EagerClient { public: GrpcEagerClient(const tensorflow::SharedGrpcChannelPtr& channel, - GrpcEagerClientThread* thread) - : stub_(channel), thread_(thread) { + GrpcEagerClientThread* thread, const string& target) + : stub_(channel), thread_(thread), target_(target) { // Hold a reference to make sure the corresponding EagerClientThread // outlives the client. 
thread_->Ref(); @@ -127,7 +127,8 @@ class GrpcEagerClient : public EagerClient { new RPCState( \ &stub_, cq_, "/tensorflow.eager.EagerService/" #method, *request, \ response, std::move(done_wrapped), /*call_opts=*/nullptr, \ - /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true); \ + /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true, \ + &target_); \ } CLIENT_METHOD(CreateContext); @@ -146,7 +147,8 @@ class GrpcEagerClient : public EagerClient { new RPCState( &stub_, cq_, "/tensorflow.eager.EagerService/CloseContext", *request, response, std::move(done_wrapped), /*call_opts=*/nullptr, - /*threadpool=*/nullptr); + /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true, + &target_); VLOG(1) << "Sending RPC to close remote eager context " << request->DebugString(); @@ -194,6 +196,7 @@ class GrpcEagerClient : public EagerClient { private: ::grpc::GenericStub stub_; const GrpcEagerClientThread* thread_; + const string target_; ::grpc::CompletionQueue* cq_; @@ -236,7 +239,7 @@ class GrpcEagerClientCache : public EagerClientCache { int assigned_index = AssignClientToThread(target); GrpcEagerClientThread* thread = threads_[assigned_index].get(); core::RefCountPtr worker( - new GrpcEagerClient(shared, thread)); + new GrpcEagerClient(shared, thread, target)); it = clients_.emplace(target, std::move(worker)).first; } diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc index 272d6bb1b20..bcb98baaeb9 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc @@ -210,7 +210,8 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t, get_stub(index), &completion_queue_, *get_method_ptr(index), call->request(), call->response(), /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(), - nullptr /*threadpool*/, fail_fast_, timeout_in_ms_, 0 /* max_retries */); + /*threadpool=*/nullptr, fail_fast_, timeout_in_ms_, /*max_retries=*/0, + /*target=*/nullptr); } } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h index c72ba6035a4..041b6e51ffb 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h @@ -45,7 +45,7 @@ class RPCState : public GrpcClientCQTag { const ::grpc::string& method, const protobuf::Message& request, Response* response, StatusCallback done, CallOptions* call_opts, thread::ThreadPool* threadpool, int32 max_retries = 0, - bool fail_fast = true) + bool fail_fast = true, const string* target = nullptr) : RPCState( stub, cq, method, request, response, std::move(done), call_opts, threadpool, @@ -63,7 +63,7 @@ class RPCState : public GrpcClientCQTag { #endif // PLATFORM_GOOGLE return x; }(), - /*timeout_in_ms=*/0, max_retries) { + /*timeout_in_ms=*/0, max_retries, target) { } template @@ -71,7 +71,7 @@ class RPCState : public GrpcClientCQTag { const ::grpc::string& method, const Request& request, Response* response, StatusCallback done, CallOptions* call_opts, thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms, - int32 max_retries) + int32 max_retries, const string* target) : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)), @@ -80,7 +80,8 @@ class RPCState : public GrpcClientCQTag { cq_(cq), stub_(stub), method_(method), - fail_fast_(fail_fast) { + fail_fast_(fail_fast), 
+ target_(target) { response_ = response; ::grpc::Status s = GrpcMaybeUnparseProto(request, &request_buf_); if (!s.ok()) { @@ -152,10 +153,13 @@ class RPCState : public GrpcClientCQTag { StartCall(); } else { // Attach additional GRPC error information if any to the final status - s = Status(s.code(), - strings::StrCat(s.error_message(), - "\nAdditional GRPC error information:\n", - context_->debug_error_string())); + string error_msg = s.error_message(); + strings::StrAppend(&error_msg, "\nAdditional GRPC error information"); + if (target_) { + strings::StrAppend(&error_msg, " from remote target ", *target_); + } + strings::StrAppend(&error_msg, ":\n:", context_->debug_error_string()); + s = Status(s.code(), error_msg); // Always treat gRPC cancellation as a derived error. This ensures that // other error types are preferred during status aggregation. (gRPC // cancellation messages do not contain the original status message). @@ -196,6 +200,7 @@ class RPCState : public GrpcClientCQTag { ::grpc::GenericStub* stub_; ::grpc::string method_; bool fail_fast_; + const string* target_; }; // Represents state associated with one streaming RPC call. From 9a6a6476b563a65416b4bb438d021a2c7e52f139 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 00:40:15 +0000 Subject: [PATCH 227/412] Add test and remove decorator --- tensorflow/python/kernel_tests/map_fn_test.py | 8 +++----- tensorflow/python/ops/map_fn.py | 11 +---------- 2 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 1859c6c5873..0bc3307e484 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -189,20 +189,18 @@ class MapFnTest(test.TestCase): @test_util.run_in_graph_and_eager_modes def testMap_autograph_indirect(self): def test_function(x): - cond = tf.constant(-1) + cond = constant_op.constant(-1) if cond == 0: result = x else: result = x return result - - @tf.function def map_call(x): - return tf.map_fn(test_function, x) + return map_fn.map_fn(test_function, x) x = constant_op.constant([1]) y = map_call(x) - self.assertAllEqual([1], self.evaluate(y)) + self.assertAllEqual([1], self.evaluate(y)) @test_util.run_in_graph_and_eager_modes def testMapShape(self): diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index dfe32998282..4a21a6e148b 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,14 +39,6 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export -autograph_ctx = lazy_loader.LazyLoader( - "autograph_ctx", globals(), - "tensorflow.python.autograph.core.ag_ctx") -autograph = lazy_loader.LazyLoader( - "autograph", globals(), - "tensorflow.python.autograph.impl.api") - -@tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") def map_fn(fn, elems, @@ -483,8 +475,7 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - ag_ctx = autograph_ctx.control_status_ctx() - result_value = autograph.tf_convert(elems_value, ag_ctx) + result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = _result_value_flat_to_batchable( From 
86342e236b40996ea5b6ccd17f1e753b00668d1c Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 02:45:52 +0200 Subject: [PATCH 228/412] restore a remove export --- tensorflow/python/ops/map_fn.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 4a21a6e148b..2c9c678336e 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -39,6 +39,8 @@ from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export + +@tf_export(v1=["map_fn"]) @deprecation.deprecated_args(None, "Use fn_output_signature instead", "dtype") def map_fn(fn, elems, From a2ef8b5a0659516dad3ce3f501223286615dab56 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 17:47:38 -0700 Subject: [PATCH 229/412] Update svd_op_test to run (non-gradient) tests in eager as well as graph mode. PiperOrigin-RevId: 311640894 Change-Id: I39b4666c461c64ffe3f33992bb536961a266abd7 --- tensorflow/python/kernel_tests/BUILD | 2 +- tensorflow/python/kernel_tests/svd_op_test.py | 162 +++++++++--------- 2 files changed, 81 insertions(+), 83 deletions(-) diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index b226e0cb859..13f59b74baf 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -3468,7 +3468,7 @@ cuda_py_test( name = "svd_op_test", size = "medium", srcs = ["svd_op_test.py"], - shard_count = 20, + shard_count = 30, tags = [ "no_oss", # b/117185141. "nomsan", # TODO(b/117236102): Re-enable in msan build. diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index a53d2470aa5..6c2199cc591 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -20,8 +20,8 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops from tensorflow.python.framework import test_util @@ -31,7 +31,7 @@ from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -58,35 +58,31 @@ class SvdOpTest(test.TestCase): "Shape must be at least rank 2 but is rank 1"): linalg_ops.svd(vector) - @test_util.run_v1_only("b/120545219") - def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - all_ops = [] - for compute_uv_ in True, False: - for full_matrices_ in True, False: - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - if compute_uv_: - s1, u1, v1 = linalg_ops.svd( - matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) - s2, u2, v2 = linalg_ops.svd( - matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_) - all_ops += [s1, u1, v1, s2, u2, v2] - else: - s1 = linalg_ops.svd( - matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) - s2 = linalg_ops.svd( - matrix2, compute_uv=compute_uv_, 
full_matrices=full_matrices_) - all_ops += [s1, s2] - val = self.evaluate(all_ops) - for i in range(2): - s = 6 * i - self.assertAllEqual(val[s], val[s + 3]) # s1 == s2 - self.assertAllEqual(val[s + 1], val[s + 4]) # u1 == u2 - self.assertAllEqual(val[s + 2], val[s + 5]) # v1 == v2 - for i in range(2): - s = 12 + 2 * i - self.assertAllEqual(val[s], val[s + 1]) # s1 == s2 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) + def testExecuteMultipleWithoutError(self): + all_ops = [] + shape = [6, 5] + seed = [42, 24] + for compute_uv_ in True, False: + for full_matrices_ in True, False: + matrix1 = stateless_random_ops.stateless_random_normal(shape, seed) + matrix2 = stateless_random_ops.stateless_random_normal(shape, seed) + self.assertAllEqual(matrix1, matrix2) + if compute_uv_: + s1, u1, v1 = linalg_ops.svd( + matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) + s2, u2, v2 = linalg_ops.svd( + matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_) + all_ops += [s1, s2, u1, u2, v1, v2] + else: + s1 = linalg_ops.svd( + matrix1, compute_uv=compute_uv_, full_matrices=full_matrices_) + s2 = linalg_ops.svd( + matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_) + all_ops += [s1, s2] + val = self.evaluate(all_ops) + for i in range(0, len(val), 2): + self.assertAllEqual(val[i], val[i + 1]) def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, @@ -136,8 +132,10 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) self.assertAllClose(identity, xx, atol=tol) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def Test(self): + if not use_static_shape_ and context.executing_eagerly(): + return is_complex = dtype_ in (np.complex64, np.complex128) is_single = dtype_ in (np.float32, np.complex64) tol = 3e-4 if is_single else 1e-12 @@ -152,48 +150,48 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_, low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) - with self.session(use_gpu=True) as sess: - if use_static_shape_: - x_tf = constant_op.constant(x_np) - else: - x_tf = array_ops.placeholder(dtype_) + if use_static_shape_: + x_tf = constant_op.constant(x_np) + else: + x_tf = array_ops.placeholder(dtype_) - if compute_uv_: - s_tf, u_tf, v_tf = linalg_ops.svd( - x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) - if use_static_shape_: - s_tf_val, u_tf_val, v_tf_val = self.evaluate([s_tf, u_tf, v_tf]) - else: + if compute_uv_: + s_tf, u_tf, v_tf = linalg_ops.svd( + x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) + if use_static_shape_: + s_tf_val, u_tf_val, v_tf_val = self.evaluate([s_tf, u_tf, v_tf]) + else: + with self.session(use_gpu=True) as sess: s_tf_val, u_tf_val, v_tf_val = sess.run( [s_tf, u_tf, v_tf], feed_dict={x_tf: x_np}) + else: + s_tf = linalg_ops.svd( + x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) + if use_static_shape_: + s_tf_val = self.evaluate(s_tf) else: - s_tf = linalg_ops.svd( - x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_) - if use_static_shape_: - s_tf_val = self.evaluate(s_tf) - else: + with self.session(use_gpu=True) as sess: s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np}) - if compute_uv_: - u_np, s_np, v_np = np.linalg.svd( - x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) - else: - s_np = np.linalg.svd( - x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) - # We explicitly avoid the situation 
where numpy eliminates a first - # dimension that is equal to one. - s_np = np.reshape(s_np, s_tf_val.shape) + if compute_uv_: + u_np, s_np, v_np = np.linalg.svd( + x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) + else: + s_np = np.linalg.svd( + x_np, compute_uv=compute_uv_, full_matrices=full_matrices_) + # We explicitly avoid the situation where numpy eliminates a first + # dimension that is equal to one. + s_np = np.reshape(s_np, s_tf_val.shape) - CompareSingularValues(self, s_np, s_tf_val, tol) - if compute_uv_: - CompareSingularVectors(self, u_np, u_tf_val, min(shape_[-2:]), tol) - CompareSingularVectors(self, - np.conj(np.swapaxes(v_np, -2, -1)), v_tf_val, - min(shape_[-2:]), tol) - CheckApproximation(self, x_np, u_tf_val, s_tf_val, v_tf_val, - full_matrices_, tol) - CheckUnitary(self, u_tf_val, tol) - CheckUnitary(self, v_tf_val, tol) + CompareSingularValues(self, s_np, s_tf_val, tol) + if compute_uv_: + CompareSingularVectors(self, u_np, u_tf_val, min(shape_[-2:]), tol) + CompareSingularVectors(self, np.conj(np.swapaxes(v_np, -2, -1)), v_tf_val, + min(shape_[-2:]), tol) + CheckApproximation(self, x_np, u_tf_val, s_tf_val, v_tf_val, + full_matrices_, tol) + CheckUnitary(self, u_tf_val, tol) + CheckUnitary(self, v_tf_val, tol) return Test @@ -378,15 +376,15 @@ if __name__ == "__main__": for rows in 0, 1, 2, 5, 10, 32, 100: for cols in 0, 1, 2, 5, 10, 32, 100: for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): - shape = batch_dims + (rows, cols) - # TF2 does not support placeholders under eager so we skip it - for use_static_shape in set([True, tf2.enabled()]): + full_shape = batch_dims + (rows, cols) + for use_static_shape in set([True, False]): name = "%s_%s_static_shape_%s__compute_uv_%s_full_%s" % ( - dtype.__name__, "_".join(map(str, shape)), use_static_shape, - compute_uv, full_matrices) - _AddTest(SvdOpTest, "Svd", name, - _GetSvdOpTest(dtype, shape, use_static_shape, - compute_uv, full_matrices)) + dtype.__name__, "_".join(map(str, full_shape)), + use_static_shape, compute_uv, full_matrices) + _AddTest( + SvdOpTest, "Svd", name, + _GetSvdOpTest(dtype, full_shape, use_static_shape, + compute_uv, full_matrices)) for compute_uv in False, True: for full_matrices in False, True: dtypes = ([np.float32, np.float64] + [np.complex64, np.complex128] * @@ -397,16 +395,16 @@ if __name__ == "__main__": mat_shapes += [(5, 11), (11, 5)] for mat_shape in mat_shapes: for batch_dims in [(), (3,)]: - shape = batch_dims + mat_shape - name = "%s_%s_compute_uv_%s_full_%s" % ( - dtype.__name__, "_".join(map(str, shape)), compute_uv, - full_matrices) - _AddTest(SvdGradOpTest, "SvdGrad", name, - _GetSvdGradOpTest(dtype, shape, compute_uv, full_matrices)) + full_shape = batch_dims + mat_shape + name = "%s_%s_compute_uv_%s_full_%s" % (dtype.__name__, "_".join( + map(str, full_shape)), compute_uv, full_matrices) + _AddTest( + SvdGradOpTest, "SvdGrad", name, + _GetSvdGradOpTest(dtype, full_shape, compute_uv, full_matrices)) # The results are too inaccurate for float32. 
if dtype in (np.float64, np.complex128): _AddTest( SvdGradGradOpTest, "SvdGradGrad", name, - _GetSvdGradGradOpTest(dtype, shape, compute_uv, + _GetSvdGradGradOpTest(dtype, full_shape, compute_uv, full_matrices)) test.main() From 4662933489550ed226c1682967e8632af9218363 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 14 May 2020 18:00:36 -0700 Subject: [PATCH 230/412] Build DynamicSlice and DynamicUpdateSlice ops with MlirHloBuilder Whitelist XlaDynamicSlice and XlaDynamicUpdateSlice for testing PiperOrigin-RevId: 311642899 Change-Id: Icbf009cf69d3b183d0c83c10925a5fbaa3c49f1f --- .../compiler/mlir/xla/ir/mlir_hlo_builder.cc | 22 +++++++++ .../compiler/mlir/xla/ir/mlir_hlo_builder.h | 8 ++++ .../xla/tests/legalize-tf-with-tf2xla.mlir | 24 ++++++++++ .../xla/transforms/legalize_tf_with_tf2xla.cc | 2 + tensorflow/compiler/tests/xla_ops_test.py | 5 +- tensorflow/compiler/xla/client/xla_builder.cc | 48 +++++++++++-------- tensorflow/compiler/xla/client/xla_builder.h | 6 +++ 7 files changed, 93 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc index cc334d8654f..461c357e509 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.cc @@ -282,6 +282,28 @@ StatusOr MlirHloBuilder::SliceInternal( GetI64ElementsAttr(strides, &builder_))); } +StatusOr MlirHloBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_ty, GetValue(operand), GetValues(start_indices), + GetI64ElementsAttr(slice_sizes, &builder_))); +} + +StatusOr MlirHloBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) { + TF_ASSIGN_OR_RETURN( + mlir::Type result_ty, + ConvertShapeToType(shape, builder_)); + return MakeXlaOp(builder_.create( + loc_, result_ty, GetValue(operand), GetValue(update), + GetValues(start_indices))); +} + StatusOr MlirHloBuilder::PadInternal( const Shape& shape, XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config) { diff --git a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h index 5a84d60cdc2..fc5baaee44d 100644 --- a/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h +++ b/tensorflow/compiler/mlir/xla/ir/mlir_hlo_builder.h @@ -175,6 +175,14 @@ class MlirHloBuilder : public XlaBuilder { absl::Span limit_indices, absl::Span strides) override; + StatusOr DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) override; + + StatusOr DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) override; + StatusOr PadInternal(const Shape& shape, XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config) override; diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir index 01398eb7314..e8d5cfe997d 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf-with-tf2xla.mlir @@ -163,6 +163,30 @@ func @truncated_normal() -> tensor<2x2xf32> { return %1 : tensor<2x2xf32> } +// CHECK-LABEL: dynamic_update_slice +// CHECK-SAME: (%[[ARG0:.*]]: 
tensor<3x4xi32>, %[[ARG1:.*]]: tensor<2x2xi32>, %[[ARG2:.*]]: tensor<2xi32> +func @dynamic_update_slice(%arg0: tensor<3x4xi32>, %arg1: tensor<2x2xi32>, %arg2: tensor<2xi32>) -> tensor<3x4xi32> { + + // CHECK: %[[SLICE0:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<0> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM0:.*]] = "xla_hlo.reshape"(%[[SLICE0]]) : (tensor<1xi32>) -> tensor + + // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%[[ARG2]]) + // CHECK-DAG-SAME: start_indices = dense<1> : tensor<1xi64> + // CHECK-DAG-SAME: limit_indices = dense<2> : tensor<1xi64> + // CHECK-DAG-SAME: strides = dense<1> : tensor<1xi64> + // CHECK-SAME: (tensor<2xi32>) -> tensor<1xi32> + // CHECK: %[[DIM1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<1xi32>) -> tensor + + // CHECK: "xla_hlo.dynamic-update-slice"(%[[ARG0]], %[[ARG1]], %[[DIM0]], %[[DIM1]]) + + %0 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %arg2) : (tensor<3x4xi32>, tensor<2x2xi32>, tensor<2xi32>) -> tensor<3x4xi32> + return %0: tensor<3x4xi32> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc index 86a2defd3a8..76657bd5e20 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_with_tf2xla.cc @@ -168,6 +168,8 @@ static bool IsOpWhitelisted(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get() diff --git a/tensorflow/compiler/tests/xla_ops_test.py b/tensorflow/compiler/tests/xla_ops_test.py index 1f83701ea7c..f3e915daa67 100644 --- a/tensorflow/compiler/tests/xla_ops_test.py +++ b/tensorflow/compiler/tests/xla_ops_test.py @@ -304,7 +304,6 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): self._assertOpOutputMatchesExpected( lambda x: xla.transpose(x, [1, 0]), args=(v,), expected=v.T) - @test_util.disable_mlir_bridge('Not supported yet') def testDynamicSlice(self): for dtype in self.numeric_types: self._assertOpOutputMatchesExpected( @@ -317,7 +316,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): [[673, 674], [683, 684], [693, 694]]]), dtype=dtype)) - @test_util.disable_mlir_bridge('Not supported yet') + @test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectStartIndicesShape(self): with self.session() as session: with self.test_scope(): @@ -331,7 +330,7 @@ class XlaOpsNumericalTest(xla_test.XLATestCase, parameterized.TestCase): (r'start_indices must be a vector with length equal to input rank, ' r'but input rank is 3 and start_indices has shape \[2\].*')) - @test_util.disable_mlir_bridge('Not supported yet') + @test_util.disable_mlir_bridge('Error handling') def testDynamicSliceWithIncorrectSizeIndicesShape(self): with self.session() as session: with self.test_scope(): diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 6539817d524..a4e5b936153 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -864,8 +864,6 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, absl::Span 
start_indices, absl::Span slice_sizes) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); std::vector start_indices_shape_ptrs; TF_ASSIGN_OR_RETURN(const auto& start_indices_shapes, @@ -876,23 +874,28 @@ XlaOp XlaBuilder::DynamicSlice(XlaOp operand, TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDynamicSliceShape( *operand_shape, start_indices_shapes, slice_sizes)); - *instr.mutable_shape() = shape.ToProto(); - - for (int64 size : slice_sizes) { - instr.add_dynamic_slice_sizes(size); - } - - std::vector operands = {operand}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); + return DynamicSliceInternal(shape, operand, start_indices, slice_sizes); }); } +StatusOr XlaBuilder::DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + for (int64 size : slice_sizes) { + instr.add_dynamic_slice_sizes(size); + } + + std::vector operands = {operand}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicSlice, operands); +} + XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices) { return ReportErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); TF_ASSIGN_OR_RETURN(const Shape* update_shape, GetShapePtr(update)); std::vector start_indices_shape_ptrs; @@ -904,15 +907,22 @@ XlaOp XlaBuilder::DynamicUpdateSlice(XlaOp operand, XlaOp update, TF_ASSIGN_OR_RETURN( Shape shape, ShapeInference::InferDynamicUpdateSliceShape( *operand_shape, *update_shape, start_indices_shapes)); - *instr.mutable_shape() = shape.ToProto(); - - std::vector operands = {operand, update}; - operands.insert(operands.end(), start_indices.begin(), start_indices.end()); - return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, - operands); + return DynamicUpdateSliceInternal(shape, operand, update, start_indices); }); } +StatusOr XlaBuilder::DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices) { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + + std::vector operands = {operand, update}; + operands.insert(operands.end(), start_indices.begin(), start_indices.end()); + return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice, + operands); +} + XlaOp XlaBuilder::ConcatInDim(absl::Span operands, int64 dimension) { return ReportErrorOrReturn([&]() -> StatusOr { diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 24b0cba3a1b..b631514248c 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -423,9 +423,15 @@ class XlaBuilder { XlaOp DynamicSlice(XlaOp operand, absl::Span start_indices, absl::Span slice_sizes); + virtual StatusOr DynamicSliceInternal( + const Shape& shape, XlaOp operand, absl::Span start_indices, + absl::Span slice_sizes); XlaOp DynamicUpdateSlice(XlaOp operand, XlaOp update, absl::Span start_indices); + virtual StatusOr DynamicUpdateSliceInternal( + const Shape& shape, XlaOp operand, XlaOp update, + absl::Span start_indices); XlaOp ConcatInDim(absl::Span 
operands, int64 dimension); virtual StatusOr ConcatInDimInternal(const Shape& shape, From 377612c026bcfc1fd86e63b7c5f995101d7bebfd Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 14 May 2020 18:19:41 -0700 Subject: [PATCH 231/412] Fix hardswish test for ubsan. PiperOrigin-RevId: 311645688 Change-Id: Id9f3b31da09355c9997f3f2cc95dca5954c956ec --- tensorflow/lite/kernels/internal/reference/reference_ops.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h index f40b268b443..1a6c6d0d80e 100644 --- a/tensorflow/lite/kernels/internal/reference/reference_ops.h +++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h @@ -2597,7 +2597,7 @@ inline void HardSwish(const HardSwishParams& params, // significant bits in the high bits of our 16-bit fixedpoint values, so // that fixed-point approximate computations below are as accurate as // possible. - const int16_t input_value_on_hires_input_scale = input_value << 7; + const int16_t input_value_on_hires_input_scale = input_value * (1 << 7); // Compute the input value on essentially the output scale, just not // right-shifted yet. This is the value that we'll use in the (x >= +3) // case, and that in the general case we'll multiply against the "relu-ish" From a5267f056ff5838ad7ac7dd8c8f1fc29e3064d68 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 18:40:52 -0700 Subject: [PATCH 232/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311648388 Change-Id: Id9f6f7c4de82be3a405377e722e740fd0dfee80d --- tensorflow/go/op/wrappers.go | 44 ++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index a6ee1a13b6e..e6725269279 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. 
-// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to {f:0.75 f:1.33} func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to {f:0.05 f:1} func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to {i:1 i:1 i:1 i:1} func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value From 0de7edf8b11755878043e76200d575b08c025d63 Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Thu, 14 May 2020 19:03:56 -0700 Subject: [PATCH 233/412] Generate separate pod for Core ML delegate PiperOrigin-RevId: 311651255 Change-Id: I7ba8755d447674fa8d20935b0c9815a5406b879f --- .../lite/experimental/delegates/coreml/BUILD | 6 ++++- tensorflow/lite/experimental/ios/BUILD.apple | 25 ++++++++++++++++--- .../ios/TensorFlowLiteC.podspec.template | 14 +++++++++-- .../lite/experimental/swift/BUILD.apple | 14 +++++++++++ .../swift/Sources/CoreMLDelegate.swift | 2 +- .../swift/TensorFlowLiteSwift-nightly.podspec | 16 ++++++++++-- .../TensorFlowLiteSwift.podspec.template | 16 ++++++++++-- 7 files changed, 81 insertions(+), 12 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/coreml/BUILD b/tensorflow/lite/experimental/delegates/coreml/BUILD index 92aa96d5c50..c04aba65aa0 100644 --- a/tensorflow/lite/experimental/delegates/coreml/BUILD +++ b/tensorflow/lite/experimental/delegates/coreml/BUILD @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== - package(default_visibility = [ "//visibility:public", ]) @@ -46,6 +45,11 @@ objc_library( name = "coreml_delegate", srcs = ["coreml_delegate.mm"], hdrs = ["coreml_delegate.h"], + module_name = "TensorFlowLiteCCoreML", + # By setting CoreML as weak_framework, the TensorFlow Lite can be built for older iOS versions. + weak_sdk_frameworks = [ + "CoreML", + ], deps = [ ":coreml_delegate_kernel", ":mlmodel_proto_cc", diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index 8e7b32eba91..5c954bc3de8 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -24,7 +24,6 @@ genrule( TFL_FRAMEWORK_HDRS = [ "//tensorflow/lite/delegates/gpu:metal_delegate.h", - ":coreml_delegate.h", "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", ] @@ -58,16 +57,35 @@ ios_static_framework( ], ) +# This target builds the Core ML delegate as a separate static framework, which +# does not include the TensorFlow Lite runtime. As this target does not contain +# TensorFlow Lite runtime, it is intended to be linked along with the +# TensorFlowLiteC framework above in a composable way. 
+# +# bazel build -c opt --config=ios_fat //tensorflow/lite/experimental/ios:TensorFlowLiteCCoreMl_framework +ios_static_framework( + name = "TensorFlowLiteCCoreML_framework", + hdrs = [ + ":coreml_delegate.h", + ], + avoid_deps = [ + ":tensorflow_lite_c", + ], + bundle_name = "TensorFlowLiteCCoreML", + minimum_os_version = TFL_MINIMUM_OS_VERSION, + deps = [ + "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", + ], +) + cc_library( name = "tensorflow_lite_c", hdrs = [ "//tensorflow/lite/c:c_api.h", "//tensorflow/lite/c:common.h", "//tensorflow/lite/delegates/gpu:metal_delegate.h", - "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h", ], linkopts = [ - "-Wl,-weak_framework,CoreML", "-Wl,-weak_framework,Metal", ], tags = [ @@ -77,7 +95,6 @@ cc_library( deps = [ "//tensorflow/lite/c:c_api", "//tensorflow/lite/delegates/gpu:metal_delegate", - "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", ], ) diff --git a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template index d69c479282b..d8a5ef8f2e1 100644 --- a/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template +++ b/tensorflow/lite/experimental/ios/TensorFlowLiteC.podspec.template @@ -19,6 +19,16 @@ Pod::Spec.new do |s| s.module_name = 'TensorFlowLiteC' s.library = 'c++' - s.vendored_frameworks = 'Frameworks/TensorFlowLiteC.framework' - s.weak_frameworks = 'CoreML' + + s.default_subspec = 'Core' + + s.subspec 'Core' do |core| + core.vendored_frameworks = 'Frameworks/TensorFlowLiteC.framework' + end + + s.subspec 'CoreML' do |coreml| + coreml.weak_framework = 'CoreML' + coreml.dependency 'TensorFlowLiteC/Core' + coreml.vendored_frameworks = 'Frameworks/TensorFlowLiteCCoreML.framework' + end end diff --git a/tensorflow/lite/experimental/swift/BUILD.apple b/tensorflow/lite/experimental/swift/BUILD.apple index 50130fc194a..e671721dd1c 100644 --- a/tensorflow/lite/experimental/swift/BUILD.apple +++ b/tensorflow/lite/experimental/swift/BUILD.apple @@ -10,6 +10,19 @@ package( licenses = ["notice"], # Apache 2.0 ) +# TODO(b/153554551): investigate if separate delegate libraries can be made with same module_name +# If you don't need delegates and want to reduce size of the app, you can exclude Metal/Core ML +# delegate related dependencies from the rule. +# For example, if you don't want to use Core ML delegate: +# 1. add `exclude = ["Sources/CoreMLDelegate.swift"]` to `glob`, so that `srcs` would look like this: +# ``` +# srcs = glob( +# ["Sources/*.swift"], +# exclude = ["Sources/CoreMLDelegate.swift"], +# ), +# 2. remove "-Wl,-weak_framework,CoreML" from `linkopts` +# 3. remove "...:coreml_delegate" from `deps` + swift_library( name = "TensorFlowLite", srcs = glob(["Sources/*.swift"]), @@ -21,6 +34,7 @@ swift_library( tags = TFL_DEFAULT_TAGS, visibility = ios_visibility_whitelist(), deps = [ + "//tensorflow/lite/experimental/delegates/coreml:coreml_delegate", "//tensorflow/lite/experimental/ios:tensorflow_lite_c", ], ) diff --git a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift index 5a1526d45ea..9fc76bc3026 100644 --- a/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift +++ b/tensorflow/lite/experimental/swift/Sources/CoreMLDelegate.swift @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-import TensorFlowLiteC +import TensorFlowLiteCCoreML /// A delegate that uses the `Core ML` framework for performing TensorFlow Lite graph operations. /// diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec index 3b21483f663..8b0e797eeaa 100644 --- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec +++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift-nightly.podspec @@ -20,8 +20,20 @@ Pod::Spec.new do |s| tfl_dir = 'tensorflow/lite/' swift_dir = tfl_dir + 'experimental/swift/' - s.source_files = swift_dir + 'Sources/*.swift' - s.dependency 'TensorFlowLiteC', "~> #{s.version}" + + s.default_subspec = 'Core' + + s.subspec 'Core' do |core| + core.dependency 'TensorFlowLiteC', "#{s.version}" + core.source_files = swift_dir + 'Sources/*.swift' + core.exclude_files = swift_dir + 'Sources/CoreMLDelegate.swift' + end + + s.subspec 'CoreML' do |coreml| + coreml.source_files = swift_dir + 'Sources/CoreMLDelegate.swift' + coreml.dependency 'TensorFlowLiteC/CoreML', "#{s.version}" + coreml.dependency 'TensorFlowLiteSwift/Core', "#{s.version}" + end s.test_spec 'Tests' do |ts| ts.source_files = swift_dir + 'Tests/*.swift' diff --git a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template index 9e875b44ee2..a925112f539 100644 --- a/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template +++ b/tensorflow/lite/experimental/swift/TensorFlowLiteSwift.podspec.template @@ -20,8 +20,20 @@ Pod::Spec.new do |s| tfl_dir = 'tensorflow/lite/' swift_dir = tfl_dir + 'experimental/swift/' - s.source_files = swift_dir + 'Sources/*.swift' - s.dependency 'TensorFlowLiteC', '~> 0.0.1-nightly' + + s.default_subspec = 'Core' + + s.subspec 'Core' do |core| + core.dependency 'TensorFlowLiteC', "#{s.version}" + core.source_files = swift_dir + 'Sources/*.swift' + core.exclude_files = swift_dir + 'Sources/CoreMLDelegate.swift' + end + + s.subspec 'CoreML' do |coreml| + coreml.source_files = swift_dir + 'Sources/CoreMLDelegate.swift' + coreml.dependency 'TensorFlowLiteC/CoreML', "#{s.version}" + coreml.dependency 'TensorFlowLiteSwift/Core', "#{s.version}" + end s.test_spec 'Tests' do |ts| ts.source_files = swift_dir + 'Tests/*.swift' From efa3fb28d94b7937edaafb5874c191ad0e2149ca Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 19:07:07 -0700 Subject: [PATCH 234/412] Split index_lookup into string_lookup and integer_lookup. 
PiperOrigin-RevId: 311651579 Change-Id: Ie033727dbe1026a7c7a88e4b31653840a17ac3d1 --- .../python/keras/layers/preprocessing/BUILD | 64 +- .../layers/preprocessing/index_lookup.py | 253 ++++---- .../index_lookup_distribution_test.py | 7 +- .../layers/preprocessing/index_lookup_test.py | 597 +++++++++++++----- .../layers/preprocessing/integer_lookup.py | 112 ++++ .../preprocessing/integer_lookup_test.py | 501 +++++++++++++++ .../layers/preprocessing/integer_lookup_v1.py | 25 + .../layers/preprocessing/string_lookup.py | 106 ++++ .../preprocessing/string_lookup_test.py | 224 +++++++ .../layers/preprocessing/string_lookup_v1.py | 25 + .../keras/layers/preprocessing/table_utils.py | 1 - .../preprocessing/text_vectorization.py | 68 +- .../preprocessing/text_vectorization_test.py | 106 +--- .../preprocessing/text_vectorization_v1.py | 4 +- ...al.preprocessing.-text-vectorization.pbtxt | 2 +- ...al.preprocessing.-text-vectorization.pbtxt | 2 +- 16 files changed, 1658 insertions(+), 439 deletions(-) create mode 100644 tensorflow/python/keras/layers/preprocessing/integer_lookup.py create mode 100644 tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py create mode 100644 tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py create mode 100644 tensorflow/python/keras/layers/preprocessing/string_lookup.py create mode 100644 tensorflow/python/keras/layers/preprocessing/string_lookup_test.py create mode 100644 tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index bef294429bd..c1e1d5573e5 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -27,10 +27,12 @@ py_library( ":discretization", ":hashing", ":image_preprocessing", + ":integer_lookup", ":normalization", ":preprocessing_stage", ":preprocessing_test_utils", ":reduction", + ":string_lookup", ":text_vectorization", ], ) @@ -146,6 +148,20 @@ py_library( ], ) +py_library( + name = "integer_lookup", + srcs = [ + "integer_lookup.py", + "integer_lookup_v1.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":index_lookup", + ":table_utils", + "//tensorflow/python:dtypes", + ], +) + py_library( name = "table_utils", srcs = [ @@ -179,7 +195,7 @@ py_library( srcs_version = "PY2AND3", deps = [ ":categorical_encoding", - ":index_lookup", + ":string_lookup", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", "//tensorflow/python:dtypes", @@ -235,6 +251,20 @@ py_library( ], ) +py_library( + name = "string_lookup", + srcs = [ + "string_lookup.py", + "string_lookup_v1.py", + ], + srcs_version = "PY2AND3", + deps = [ + ":index_lookup", + ":table_utils", + "//tensorflow/python:dtypes", + ], +) + py_library( name = "preprocessing_stage", srcs = [ @@ -442,6 +472,22 @@ tf_py_test( ], ) +tf_py_test( + name = "integer_lookup_test", + size = "medium", + srcs = ["integer_lookup_test.py"], + python_version = "PY3", + deps = [ + ":integer_lookup", + ":preprocessing_test_utils", + "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", + "//tensorflow/python/keras/utils:generic_utils", + "//tensorflow/python/ops/ragged:ragged_string_ops", + "@absl_py//absl/testing:parameterized", + ], +) + distribute_py_test( name = "normalization_distribution_test", srcs = ["normalization_distribution_test.py"], @@ -517,6 +563,22 @@ tf_py_test( ], ) +tf_py_test( + name = "string_lookup_test", + size = "medium", + srcs = 
["string_lookup_test.py"], + python_version = "PY3", + deps = [ + ":preprocessing_test_utils", + ":string_lookup", + "//tensorflow/python:client_testlib", + "//tensorflow/python/keras", + "//tensorflow/python/keras/utils:generic_utils", + "//tensorflow/python/ops/ragged:ragged_string_ops", + "@absl_py//absl/testing:parameterized", + ], +) + tf_py_test( name = "preprocessing_stage_test", srcs = ["preprocessing_stage_test.py"], diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup.py b/tensorflow/python/keras/layers/preprocessing/index_lookup.py index d6c8a07c8ba..ba9b0d740e1 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup.py @@ -41,14 +41,16 @@ _ACCUMULATOR_COUNTS_NAME = "counts" class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): - """Maps strings (or integers) from a vocabulary to integer indices. + """Maps values from a vocabulary to integer indices. - This layer translates a set of arbitrary strings or integers into an integer - output via a table-based lookup, with optional out-of-vocabulary handling. + This layer translates a set of arbitrary hashables into an integer output via + a table-based lookup, with optional out-of-vocabulary handling. This is the + basis layer for both IntegerLookup and IndexLookup; it holds the common + logic but is not intended to be exported as part of the Keras API. If desired, the user can call this layer's `adapt()` method on a data set, which will analyze the data set, determine the frequency of individual string - or integer values, and create a vocabulary from them. This vocabulary can have + values, and create a vocabulary from them. This vocabulary can have unlimited size or be capped, depending on the configuration options for this layer; if there are more unique values in the input than the maximum vocabulary size, the most frequent terms will be used to create the @@ -56,84 +58,47 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): Attributes: max_tokens: The maximum size of the vocabulary for this layer. If None, - there is no cap on the size of the vocabulary. Note that the vocabulary - does include OOV buckets, so the effective number of unique values in the - vocabulary is `(max_tokens - num_oov_tokens)` when this value is set. - num_oov_tokens: The number of out-of-vocabulary tokens to use; defaults to - 1. If this value is more than 1, OOV inputs are hashed to determine their - OOV value; if this value is 0, passing an OOV input will result in a '-1' - being returned for that value in the output tensor. (Note that, because - the value is -1 and not 0, this will allow you to effectively drop OOV - values from categorical encodings.) - vocabulary: An optional list of vocabulary terms, or a path to a text file - containing a vocabulary to load into this layer. The file should contain - one token per line. In either case, the vocabulary must be unique; if - the list or file contains the same token multiple times, an error will - be thrown. Note that when passing a vocabulary - either as a list or as - a file - the vocabulary will not be present in the layer's config dict; - it will instead be a part of the layer's weights. - reserve_zero: Whether to reserve the index 0, which indicates pad values in - the Keras masking system. If True, the output of this layer will be in the - range `[1...max_tokens+1)`; if False, the output will be in the range - `[0...max_tokens)`. Defaults to True. 
- mask_zero: If True, input values of 0 (for integers) and `""` (for strings) - will be treated as masked values and assigned an output value of 0. If - this option is set, `reserve_zero` must also be set. Defaults to False. - Call arguments: - inputs: The data to look up. Can be a tf.Tensor or RaggedTensor. - invert: Controls the lookup direction. If False, the layer will map strings - to integers; if true, the layer will map integers to strings. Defaults - to False. + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_indices - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary tokens to use. If this + value is more than 1, OOV inputs are hashed to determine their OOV value; + if this value is 0, passing an OOV input will result in a '-1' being + returned for that value in the output tensor. (Note that, because the + value is -1 and not 0, this will allow you to effectively drop OOV values + from categorical encodings.) + mask_token: A token that represents masked values, and which is mapped to + index 0. If set to None, no mask term will be added and the OOV tokens, if + any, will be indexed from (0...num_oov_indices) instead of + (1...num_oov_indices+1). + oov_token: The token representing an out-of-vocabulary value. This token is + only used when performing an inverse lookup. + vocabulary: An optional list of vocabulary terms. If the list contains the + same token multiple times, an error will be thrown. """ # TODO(momernick): Add an examples section to the docstring. def __init__(self, - max_tokens=None, - num_oov_tokens=1, + max_tokens, + num_oov_indices, + mask_token, + oov_token, vocabulary=None, - reserve_zero=True, - mask_zero=False, **kwargs): - invert = False - if invert: - allowed_dtypes = [dtypes.int32, dtypes.int64] - else: - allowed_dtypes = [dtypes.string, dtypes.int32, dtypes.int64] - - if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: - raise ValueError("TextVectorization may only have a dtype in %s." % - allowed_dtypes) - - if "dtype" not in kwargs: - kwargs["dtype"] = dtypes.int64 if invert else dtypes.string # If max_tokens is set, the value must be greater than 1 - otherwise we # are creating a 0-element vocab, which doesn't make sense. if max_tokens is not None and max_tokens <= 1: raise ValueError("If set, max_tokens must be greater than 1.") - if num_oov_tokens < 0: - raise ValueError("num_oov_tokens must be greater than 0. You passed %s" % - num_oov_tokens) + if num_oov_indices < 0: + raise ValueError("num_oov_indices must be greater than 0. You passed %s" % + num_oov_indices) - self.invert = invert self.max_tokens = max_tokens - self.num_oov_tokens = num_oov_tokens - self.reserve_zero = reserve_zero - self.mask_zero = mask_zero - - # We need to reserve at least num_oov_tokens tokens, plus one additional - # value if we are reserving the zero value in our output. - if reserve_zero: - self._reserved_values = (num_oov_tokens + 1) - else: - self._reserved_values = num_oov_tokens - - # We need to account for the OOV buckets in our vocabulary size. 
- if max_tokens is not None: - self._max_elements = max_tokens - num_oov_tokens - else: - self._max_elements = None + self.num_oov_indices = num_oov_indices + self.oov_token = oov_token + self.mask_token = mask_token # If there is only one OOV bucket, we can determine the OOV value (either 0 # or 1 depending on whether 0 is reserved) and set that as the default @@ -141,20 +106,17 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # do a further hashing step; to make this easier, we set the OOV value to # -1. (This lets us do a vectorized add and cast to boolean to determine # locations where we need to do extra hashing.) - if self.num_oov_tokens == 1: - self._oov_value = 1 if reserve_zero else 0 + if self.num_oov_indices == 1: + self._oov_value = 0 if mask_token is None else 1 else: self._oov_value = -1 super(IndexLookup, self).__init__( - combiner=_IndexLookupCombiner(self.max_tokens), **kwargs) + combiner=_IndexLookupCombiner(self.max_tokens, self.mask_token), + **kwargs) + + self._output_dtype = dtypes.int64 - # If the layer's input type is int32, we can only output int32 values - - # MutableHashTable doesn't allow us to map int32->int64. - if self.dtype == dtypes.int32: - self._output_dtype = dtypes.int32 - else: - self._output_dtype = dtypes.int64 self._table = lookup_ops.MutableHashTable( key_dtype=self.dtype, value_dtype=self._output_dtype, @@ -167,33 +129,27 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # counting code in the Model object doesn't throw an attribute error. tracked_table.shape = tensor_shape.TensorShape((0,)) - if self.num_oov_tokens <= 1: - oov_tokens = None + if self.num_oov_indices <= 1: + oov_indices = None else: - oov_start = 1 if reserve_zero else 0 - oov_tokens = list(range(oov_start, self._reserved_values)) + oov_start = 1 if mask_token is not None else 0 + oov_end = oov_start + num_oov_indices + oov_indices = list(range(oov_start, oov_end)) self._table_handler = table_utils.TableHandler( table=self._table, - oov_tokens=oov_tokens, + oov_tokens=oov_indices, use_v1_apis=self._use_v1_apis()) if vocabulary is not None: - if isinstance(vocabulary, str): - vocabulary = table_utils.get_vocabulary_from_file(vocabulary) - table_utils.validate_vocabulary_is_unique(vocabulary) - self.set_vocabulary(vocabulary) def compute_output_shape(self, input_shape): return input_shape - def compute_output_signature(self, input_spec, invert=False): + def compute_output_signature(self, input_spec): output_shape = self.compute_output_shape(input_spec.shape.as_list()) - if invert: - output_dtype = dtypes.string - else: - output_dtype = dtypes.int64 + output_dtype = dtypes.int64 return tensor_spec.TensorSpec(shape=output_shape, dtype=output_dtype) def adapt(self, data, reset_state=True): @@ -220,10 +176,7 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): keys, values = self._table_handler.data() # This is required because the MutableHashTable doesn't preserve insertion # order, but we rely on the order of the array to assign indices. 
- if self.dtype == dtypes.string: - return [x.decode("utf-8") for _, x in sorted(zip(values, keys))] - else: - return [x for _, x in sorted(zip(values, keys))] + return [x for _, x in sorted(zip(values, keys))] def vocab_size(self): return self._table_handler.vocab_size() @@ -231,10 +184,9 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): def get_config(self): config = { "max_tokens": self.max_tokens, - "num_oov_tokens": self.num_oov_tokens, - "vocabulary": None, - "reserve_zero": self.reserve_zero, - "mask_zero": self.mask_zero, + "num_oov_indices": self.num_oov_indices, + "oov_token": self.oov_token, + "mask_token": self.mask_token, } base_config = super(IndexLookup, self).get_config() return dict(list(base_config.items()) + list(config.items())) @@ -246,46 +198,101 @@ class IndexLookup(base_preprocessing_layer.CombinerPreprocessingLayer): # abstraction for ease of saving!) we return 0. return 0 - def set_vocabulary(self, - vocab, - append=False): + def set_vocabulary(self, vocab): """Sets vocabulary (and optionally document frequency) data for this layer. This method sets the vocabulary for this layer directly, instead of analyzing a dataset through 'adapt'. It should be used whenever the vocab information is already known. If vocabulary data is already present in the - layer, this method will either replace it, if 'append' is set to False, or - append to it (if 'append' is set to True). + layer, this method will either replace it Arguments: vocab: An array of string tokens. - append: Whether to overwrite or append any existing vocabulary data. Raises: ValueError: If there are too many inputs, the inputs do not match, or input data is missing. """ - current_table_size = self._table_handler.vocab_size() - total_vocab_size = len(vocab) + (current_table_size if append else 0) - if self.max_tokens is not None and total_vocab_size > self._max_elements: + + table_utils.validate_vocabulary_is_unique(vocab) + + should_have_mask = self.mask_token is not None + if should_have_mask: + has_mask = vocab[0] == self.mask_token + oov_start = 1 + else: + has_mask = False + oov_start = 0 + + should_have_oov = self.num_oov_indices > 0 + if should_have_oov: + oov_end = oov_start + self.num_oov_indices + expected_oov = [self.oov_token] * self.num_oov_indices + has_oov = vocab[oov_start:oov_end] == expected_oov + # If we get a numpy array, then has_oov may end up being a numpy array + # instead of a bool. Fix this by collapsing the variable if it's not bool. + if not isinstance(has_oov, bool): + has_oov = any(has_oov) + else: + has_oov = False + + if all([should_have_mask, has_mask, should_have_oov]) and not has_oov: + raise ValueError("The passed vocabulary has the correct mask token `%s` " + "at index 0, but does not have the OOV token `%s` in " + "indices [%s:%s]. Instead, we found `%s`. Was this " + "vocabulary generated by a layer with incompatible " + "settings?" % + (self.mask_token, self.oov_token, oov_start, oov_end, + vocab[oov_start:oov_end])) + + if all([should_have_oov, has_oov, should_have_mask]) and not has_mask: + raise ValueError( + "The passed vocabulary has the correct OOV token `%s` at " + "indices [%s:%s], but does not have the mask token `%s` in " + "index 0. Instead, we found `%s`. Was this vocabulary " + "generated by a layer with incompatible settings?" 
% + (self.oov_token, oov_start, oov_end, self.mask_token, vocab[0])) + + insert_special_tokens = not has_oov and not has_mask + + special_tokens = [] if self.mask_token is None else [self.mask_token] + special_tokens.extend([self.oov_token] * self.num_oov_indices) + + num_special_tokens = len(special_tokens) + tokens = vocab if insert_special_tokens else vocab[num_special_tokens:] + if self.mask_token in tokens: + raise ValueError("Reserved mask token %s was found in the passed " + "vocabulary at index %s. Please either remove the " + "reserved token from the vocabulary or change the " + "mask token for this layer." % + (self.mask_token, tokens.index(self.mask_token))) + if self.oov_token in tokens: + raise ValueError("Reserved OOV token %s was found in the passed " + "vocabulary at index %s. Please either remove the " + "reserved token from the vocabulary or change the " + "OOV token for this layer." % + (self.oov_token, tokens.index(self.oov_token))) + + if insert_special_tokens: + total_vocab_size = len(vocab) + num_special_tokens + else: + total_vocab_size = len(vocab) + if self.max_tokens is not None and total_vocab_size > self.max_tokens: raise ValueError( "Attempted to set a vocabulary larger than the maximum vocab size. " - "Passed vocab size is %s, max vocab size is %s. Note that the OOV " - "token(s) are automatically added to the number of tokens." % + "Passed vocab size is %s, max vocab size is %s." % (total_vocab_size, self.max_tokens)) - start_index = self._reserved_values + (current_table_size if append else 0) + start_index = num_special_tokens values = np.arange(start_index, len(vocab) + start_index, dtype=np.int64) - vocab = table_utils.convert_to_ndarray(vocab, self.dtype) - table_utils.assert_same_type(self.dtype, vocab, "vocab") - values = table_utils.convert_to_ndarray(values, self._output_dtype) - table_utils.assert_same_type(self._output_dtype, values, "values") - - if not append and current_table_size > 0: - self._table_handler.clear() + self._table_handler.clear() self._table_handler.insert(vocab, values) + if insert_special_tokens and num_special_tokens > 0: + special_token_values = np.arange(num_special_tokens, dtype=np.int64) + self._table_handler.insert(special_tokens, special_token_values) + def _set_state_variables(self, updates): if not self.built: raise RuntimeError("_set_state_variables() must be called after build().") @@ -316,18 +323,20 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner): dataset, all tokens are retained.s """ - def __init__(self, vocab_size=None): + def __init__(self, vocab_size=None, mask_value=None): self._vocab_size = vocab_size + self._mask_value = mask_value def compute(self, values, accumulator=None): """Compute a step in this computation, returning a new accumulator.""" - values = base_preprocessing_layer.convert_to_list(values) + values = base_preprocessing_layer.convert_to_list( + values, sparse_default_value=self._mask_value) if accumulator is None: accumulator = self._create_accumulator() # TODO(momernick): Benchmark improvements to this algorithm. - if isinstance(values, (str, bytes)): + if isinstance(values, (str, bytes, np.int64)): accumulator.count_dict[values] += 1 else: for document in values: @@ -362,6 +371,8 @@ class _IndexLookupCombiner(base_preprocessing_layer.Combiner): "vocab": A list of the retained items in the vocabulary. 
""" vocab_counts = accumulator.count_dict + if self._mask_value in vocab_counts: + del vocab_counts[self._mask_value] sorted_counts = sorted( vocab_counts.items(), key=operator.itemgetter(1, 0), reverse=True) vocab_data = ( diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py index 3360dad6ffe..098e67f5f6b 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_distribution_test.py @@ -65,7 +65,12 @@ class IndexLookupDistributionTest( with distribution.scope(): input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) diff --git a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py index 3c5b5757ec2..a95834233b3 100644 --- a/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py +++ b/tensorflow/python/keras/layers/preprocessing/index_lookup_test.py @@ -21,7 +21,6 @@ from __future__ import print_function import itertools import os import random -import six import string from absl.testing import parameterized @@ -31,7 +30,6 @@ from tensorflow.python import keras from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import one_device_strategy from tensorflow.python.eager import context from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor @@ -44,7 +42,6 @@ from tensorflow.python.keras.layers.preprocessing import preprocessing_test_util from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops.ragged import ragged_factory_ops -from tensorflow.python.platform import gfile from tensorflow.python.platform import test @@ -71,6 +68,10 @@ def _get_end_to_end_test_cases(): ["and"], ["earth"], ["michigan"]]), "kwargs": { "max_tokens": None, + "num_oov_indices": 1, + "mask_token": "", + "oov_token": "[OOV]", + "dtype": dtypes.string, }, "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], "input_dtype": @@ -91,6 +92,9 @@ def _get_end_to_end_test_cases(): dtype=np.int64), "kwargs": { "max_tokens": None, + "num_oov_indices": 1, + "mask_token": 0, + "oov_token": -1, "dtype": dtypes.int64, }, "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], @@ -172,7 +176,12 @@ class CategoricalEncodingInputTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -193,7 +202,12 @@ class CategoricalEncodingInputTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + 
mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -209,7 +223,12 @@ class CategoricalEncodingInputTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -223,7 +242,12 @@ class CategoricalEncodingInputTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -248,7 +272,12 @@ class CategoricalEncodingMultiOOVTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.string, sparse=True) - layer = get_layer_class()(max_tokens=None, num_oov_tokens=2) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -270,7 +299,11 @@ class CategoricalEncodingMultiOOVTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) layer = get_layer_class()( - max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2) + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -286,7 +319,12 @@ class CategoricalEncodingMultiOOVTest( expected_output = [[3, 4, 6], [6, 5, 3, 2]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None, num_oov_tokens=2) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=2, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -301,7 +339,11 @@ class CategoricalEncodingMultiOOVTest( input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) layer = get_layer_class()( - max_tokens=None, dtype=dtypes.int64, num_oov_tokens=2) + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -321,13 +363,14 @@ class CategoricalEncodingAdaptTest( dense_shape=[3, 4]) vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) - # Note that the expected vocabulary has a null string (''). This is because - # we assume that sparse tensors are in fact dense tensors with elided - # values, not ragged tensors. Therefore, we assume that any missing data - # is important and give it a spot in our vocab. 
- expected_vocabulary = ["", "michigan", "fire"] + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) def test_ragged_adapt(self): @@ -335,9 +378,14 @@ class CategoricalEncodingAdaptTest( ["fire", "michigan"]]) vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.adapt(vocab_dataset) - expected_vocabulary = ["michigan", "fire"] + expected_vocabulary = ["", "[OOV]", "michigan", "fire"] self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) def test_sparse_int_input(self): @@ -352,7 +400,12 @@ class CategoricalEncodingAdaptTest( expected_dense_shape = [3, 4] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -368,7 +421,12 @@ class CategoricalEncodingAdaptTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string, ragged=True) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -382,7 +440,12 @@ class CategoricalEncodingAdaptTest( expected_output = [[2, 3, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) - layer = get_layer_class()(max_tokens=None, dtype=dtypes.int64) + layer = get_layer_class()( + max_tokens=None, + dtype=dtypes.int64, + num_oov_indices=1, + mask_token=0, + oov_token=-1) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -400,34 +463,15 @@ class CategoricalEncodingAdaptTest( batched_ds = ds.take(2) input_t = keras.Input(shape=(), dtype=dtypes.string) layer = get_layer_class()( - max_tokens=10, num_oov_tokens=0, reserve_zero=False) + max_tokens=10, + num_oov_indices=0, + mask_token=None, + oov_token=None, + dtype=dtypes.string) _ = layer(input_t) layer.adapt(batched_ds) -@keras_parameterized.run_all_keras_modes -class IndexLookupDistributionTest( - keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): - - def test_cpu_distribution(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - strategy = one_device_strategy.OneDeviceStrategy("/cpu:0") - - with strategy.scope(): - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - @keras_parameterized.run_all_keras_modes class IndexLookupOutputTest(keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest): @@ -439,7 +483,12 @@ class 
IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -448,7 +497,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, def test_output_shape(self): input_data = keras.Input(shape=(4,), dtype=dtypes.string) - layer = get_layer_class()() + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) @@ -459,7 +513,12 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(reserve_zero=False) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=None, + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -473,7 +532,13 @@ class IndexLookupOutputTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_data) + layer = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) output_dataset = model.predict(input_array) @@ -485,15 +550,6 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, preprocessing_test_utils.PreprocessingLayerTest ): - def _write_to_temp_file(self, file_name, vocab_list): - vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") - with gfile.GFile(vocab_path, "w") as writer: - for vocab in vocab_list: - writer.write(vocab + "\n") - writer.flush() - writer.close() - return vocab_path - def test_int_output_explicit_vocab(self): vocab_data = ["earth", "wind", "and", "fire"] input_array = np.array([["earth", "wind", "and", "fire"], @@ -501,107 +557,195 @@ class IndexLookupVocabularyTest(keras_parameterized.TestCase, expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_data) + layer = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_get_vocab_returns_str(self): - vocab_data = ["earth", "wind", "and", "fire"] - layer = get_layer_class()(vocabulary=vocab_data) - layer_vocab = layer.get_vocabulary() - self.assertAllEqual(vocab_data, layer_vocab) - self.assertIsInstance(layer_vocab[0], six.text_type) + def test_vocab_with_max_cap(self): + vocab_data = ["", "[OOV]", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=5, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + layer.set_vocabulary(vocab_data) + 
returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) - def test_int_output_explicit_vocab_from_file(self): - vocab_list = ["earth", "wind", "and", "fire"] - vocab_path = self._write_to_temp_file("vocab_file", vocab_list) - - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(vocabulary=vocab_path) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllEqual(expected_output, output_dataset) - - def test_vocab_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=5) - layer.set_vocabulary(vocab_data[0]) - layer.set_vocabulary(vocab_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) + def test_int_vocab_with_max_cap(self): + vocab_data = [0, -1, 42, 1276, 1138] + layer = get_layer_class()( + max_tokens=5, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + layer.set_vocabulary(vocab_data) + returned_vocab = layer.get_vocabulary() + self.assertAllEqual(vocab_data, returned_vocab) def test_non_unique_vocab_fails(self): vocab_data = ["earth", "wind", "and", "fire", "fire"] with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): - _ = get_layer_class()(vocabulary=vocab_data) + _ = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) - def test_non_unique_vocab_from_file_fails(self): - vocab_list = ["earth", "wind", "and", "fire", "earth"] - vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + def test_vocab_with_oov_and_wrong_mask_fails(self): + vocab_data = ["custom_mask", "[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_oov_and_no_mask_fails(self): + vocab_data = ["[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_mask_but_no_oov_fails(self): + vocab_data = ["", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"): + layer.set_vocabulary(vocab_data) + + def test_vocab_with_repeated_element_fails(self): + vocab_data = ["earth", "earth", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) with self.assertRaisesRegex(ValueError, 
".*repeated term.*earth.*"): - _ = get_layer_class()(vocabulary=vocab_path) + layer.set_vocabulary(vocab_data) + def test_vocab_with_reserved_oov_element_fails(self): + vocab_data = ["earth", "test", "[OOV]", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) -@keras_parameterized.run_all_keras_modes -class InverseLookupOutputTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): + def test_vocab_with_reserved_mask_element_fails(self): + vocab_data = ["earth", "mask_token", "wind", "and", "fire"] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="mask_token", + oov_token="[OOV]", + dtype=dtypes.string) + with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"): + layer.set_vocabulary(vocab_data) - def DISABLE_test_inverse_output(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]] - # Note that the token 'michigan' has been replaced by ''. This is because - # 'michigan' is OOV for this layer. - expected_strings = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - string_data = layer(int_data, invert=True) - model = keras.Model(inputs=input_data, outputs=[int_data, string_data]) - int_outputs, string_outputs = model.predict(input_array) - self.assertAllEqual(expected_ints, int_outputs) - self.assertAllEqual(expected_strings, string_outputs) + def test_non_unique_int_vocab_fails(self): + vocab_data = [12, 13, 14, 15, 15] + with self.assertRaisesRegex(ValueError, ".*repeated term.*15.*"): + _ = get_layer_class()( + vocabulary=vocab_data, + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) - def DISABLE_test_inverse_output_serialization(self): - vocab_data = ["earth", "wind", "and", "fire"] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_ints = [[2, 3, 4, 5], [5, 4, 2, 1]] - # Note that the token 'michigan' has been replaced by ''. This is because - # 'michigan' is OOV for this layer. 
- expected_strings = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", ""]]) + def test_int_vocab_with_oov_and_wrong_mask_fails(self): + vocab_data = [1234, -1, 11, 21, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*does not have the mask token.*"): + layer.set_vocabulary(vocab_data) - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) - layer.set_vocabulary(vocab_data) - int_data = layer(input_data) - string_data = layer(int_data, invert=True) - model = keras.Model(inputs=input_data, outputs=[int_data, string_data]) + def test_int_vocab_with_oov_and_no_mask_fails(self): + vocab_data = [-1, 11, 12, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) - with CustomObjectScope({"IndexLookup": get_layer_class()}): - new_model = keras.Model.from_config(model.get_config()) - new_model.set_weights(model.get_weights()) - int_outputs, string_outputs = new_model.predict(input_array) - self.assertAllEqual(expected_ints, int_outputs) - self.assertAllEqual(expected_strings, string_outputs) + def test_int_vocab_with_mask_but_no_oov_fails(self): + vocab_data = [0, 11, 12, 13, 14] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*does not have the OOV token.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_repeated_element_fails(self): + vocab_data = [11, 11, 34, 23, 124] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*repeated term.*11.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_reserved_oov_element_fails(self): + vocab_data = [14, 38, -1, 34, 3, 84] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved OOV.*"): + layer.set_vocabulary(vocab_data) + + def test_int_vocab_with_reserved_mask_element_fails(self): + vocab_data = [125, 0, 3, 4, 94] + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token=0, + oov_token=-1, + dtype=dtypes.int64) + with self.assertRaisesRegex(ValueError, ".*Reserved mask.*"): + layer.set_vocabulary(vocab_data) @keras_parameterized.run_all_keras_modes(always_skip_eager=True) @@ -612,7 +756,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase, vocab_data = ["earth", "wind", "and", "fire"] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=10) + layer = get_layer_class()( + max_tokens=10, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -626,7 +775,12 @@ class IndexLookupSaveableTest(keras_parameterized.TestCase, vocab_data = ["earth", "wind", "and", "fire"] input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=10) + layer = get_layer_class()( + max_tokens=10, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + 
dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -643,25 +797,24 @@ class IndexLookupErrorTest(keras_parameterized.TestCase, def test_too_long_vocab_fails_in_single_setting(self): vocab_data = ["earth", "wind", "and", "fire"] - layer = get_layer_class()(max_tokens=4) + layer = get_layer_class()( + max_tokens=4, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) with self.assertRaisesRegex(ValueError, "vocabulary larger than the maximum vocab.*"): layer.set_vocabulary(vocab_data) - def test_too_long_vocab_fails_in_multiple_settings(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - layer = get_layer_class()(max_tokens=4) - - # The first time we call set_vocabulary, we're under the max_tokens - # so it should be fine. - layer.set_vocabulary(vocab_data[0]) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data[1], append=True) - def test_zero_max_tokens_fails(self): with self.assertRaisesRegex(ValueError, ".*max_tokens.*"): - _ = get_layer_class()(max_tokens=0) + _ = get_layer_class()( + max_tokens=0, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) @keras_parameterized.run_all_keras_modes @@ -676,7 +829,12 @@ class IndexLookupSavingTest(keras_parameterized.TestCase, # Build and validate a golden model. input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()(max_tokens=None) + layer = get_layer_class()( + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + dtype=dtypes.string) layer.set_vocabulary(vocab_data) int_data = layer(input_data) model = keras.Model(inputs=input_data, outputs=int_data) @@ -705,8 +863,9 @@ class IndexLookupSavingTest(keras_parameterized.TestCase, @keras_parameterized.run_all_keras_modes -class IndexLookupCombinerTest(keras_parameterized.TestCase, - preprocessing_test_utils.PreprocessingLayerTest): +class IndexLookupStringCombinerTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): def compare_text_accumulators(self, a, b, msg=None): if a is None or b is None: @@ -834,5 +993,123 @@ class IndexLookupCombinerTest(keras_parameterized.TestCase, self.validate_accumulator_extract(combiner, data, expected_extract_output) +@keras_parameterized.run_all_keras_modes +class IndexLookupIntCombinerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def compare_text_accumulators(self, a, b, msg=None): + if a is None or b is None: + self.assertAllEqual(a, b, msg=msg) + + self.assertAllEqual(a.count_dict, b.count_dict, msg=msg) + + compare_accumulators = compare_text_accumulators + + def update_accumulator(self, accumulator, data): + accumulator.count_dict.update(dict(zip(data["vocab"], data["counts"]))) + + return accumulator + + def test_combiner_api_compatibility_int_mode(self): + data = np.array([[42, 1138, 725, 1729], [42, 1138, 725, 203]]) + combiner = index_lookup._IndexLookupCombiner() + expected_accumulator_output = { + "vocab": np.array([1138, 725, 42, 1729, 203]), + "counts": np.array([2, 2, 2, 1, 1]), + } + expected_extract_output = { + "vocab": np.array([1138, 725, 42, 1729, 203]), + } + expected_accumulator = combiner._create_accumulator() + expected_accumulator = self.update_accumulator(expected_accumulator, + expected_accumulator_output) + self.validate_accumulator_serialize_and_deserialize(combiner, 
data, + expected_accumulator) + self.validate_accumulator_uniqueness(combiner, data) + self.validate_accumulator_extract(combiner, data, expected_extract_output) + + # TODO(askerryryan): Add tests confirming equivalence to behavior of + # existing tf.keras.preprocessing.text.Tokenizer. + @parameterized.named_parameters( + { + "testcase_name": "top_k_smaller_than_full_vocab", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725]), + }, + }, + { + "testcase_name": "top_k_larger_than_full_vocab", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": 10, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725, 42]), + }, + }, + { + "testcase_name": "no_top_k", + "data": np.array([[42, 1138], [1729, 1138], [725], [1729, 1138]]), + "vocab_size": None, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([3, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725, 42]), + }, + }, + { + "testcase_name": "single_element_per_row", + "data": np.array([[42], [1138], [1729], [1138], [725]]), + "vocab_size": 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 1729, 725, 42]), + "counts": np.array([2, 1, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 725]), + }, + }, + # Which tokens are retained are based on global frequency, and thus are + # sensitive to frequency within a document. In contrast, because idf only + # considers the presence of a token in a document, it is insensitive + # to the frequency of the token within the document. + { + "testcase_name": + "retained_tokens_sensitive_to_within_document_frequency", + "data": + np.array([[42, 42], [1138, 1138], [1729, 1729], [1138, 1138], + [725, 203]]), + "vocab_size": + 3, + "expected_accumulator_output": { + "vocab": np.array([1138, 42, 1729, 725, 203]), + "counts": np.array([4, 2, 2, 1, 1]), + }, + "expected_extract_output": { + "vocab": np.array([1138, 1729, 42]), + }, + }) + def test_combiner_computation(self, data, vocab_size, + expected_accumulator_output, + expected_extract_output): + combiner = index_lookup._IndexLookupCombiner(vocab_size=vocab_size) + expected_accumulator = combiner._create_accumulator() + expected_accumulator = self.update_accumulator(expected_accumulator, + expected_accumulator_output) + self.validate_accumulator_computation(combiner, data, expected_accumulator) + self.validate_accumulator_extract(combiner, data, expected_extract_output) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py new file mode 100644 index 00000000000..671c02573db --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup.py @@ -0,0 +1,112 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.keras.layers.preprocessing import index_lookup +from tensorflow.python.keras.layers.preprocessing import table_utils + + +class IntegerLookup(index_lookup.IndexLookup): + """Maps integers from a vocabulary to integer indices. + + This layer translates a set of arbitrary integers into an integer output via a + table-based lookup, with optional out-of-vocabulary handling. + + If desired, the user can call this layer's `adapt()` method on a data set, + which will analyze the data set, determine the frequency of individual string + values, and create a vocabulary from them. This vocabulary can have + unlimited size or be capped, depending on the configuration options for this + layer; if there are more unique values in the input than the maximum + vocabulary size, the most frequent terms will be used to create the + vocabulary. + + Attributes: + max_values: The maximum size of the vocabulary for this layer. If None, + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_tokens - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary values to use; defaults to + 1. If this value is more than 1, OOV inputs are hashed to determine their + OOV value; if this value is 0, passing an OOV input will result in a '-1' + being returned for that value in the output tensor. (Note that, because + the value is -1 and not 0, this will allow you to effectively drop OOV + values from categorical encodings.) + mask_value: A value that represents masked inputs, and which is mapped to + index 0. Defaults to 0. If set to None, no mask term will be added and the + OOV tokens, if any, will be indexed from (0...num_oov_tokens) instead of + (1...num_oov_tokens+1). + oov_value: The value representing an out-of-vocabulary value. Defaults to + -1. + vocabulary: An optional list of values, or a path to a text file containing + a vocabulary to load into this layer. The file should contain one value + per line. If the list or file contains the same token multiple times, an + error will be thrown. + """ + + def __init__(self, + max_values=None, + num_oov_indices=1, + mask_value=0, + oov_value=-1, + vocabulary=None, + **kwargs): + allowed_dtypes = [dtypes.int64] + + if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: + raise ValueError("IntegerLookup may only have a dtype in %s." % + allowed_dtypes) + + if "dtype" not in kwargs: + kwargs["dtype"] = dtypes.int64 + + # If max_values is set, the value must be greater than 1 - otherwise we + # are creating a 0-element vocab, which doesn't make sense. 
+ if max_values is not None and max_values <= 1: + raise ValueError("If set, max_values must be greater than 1.") + + if num_oov_indices < 0: + raise ValueError("num_oov_indices must be greater than 0. You passed %s" % + num_oov_indices) + + if vocabulary is not None: + if isinstance(vocabulary, str): + vocabulary = table_utils.get_vocabulary_from_file(vocabulary) + vocabulary = [int(v) for v in vocabulary] + + super(IntegerLookup, self).__init__( + max_tokens=max_values, + num_oov_indices=num_oov_indices, + mask_token=mask_value, + oov_token=oov_value, + vocabulary=vocabulary, + **kwargs) + + def get_config(self): + base_config = super(IntegerLookup, self).get_config() + # Because the super config has a bunch of args we're also passing, + # we need to rename and remove them from the config dict. + base_config["max_values"] = base_config["max_tokens"] + del base_config["max_tokens"] + + base_config["mask_value"] = base_config["mask_token"] + del base_config["mask_token"] + + base_config["oov_value"] = base_config["oov_token"] + del base_config["oov_token"] + return base_config diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py new file mode 100644 index 00000000000..515a1ca6667 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_test.py @@ -0,0 +1,501 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for Keras text vectorization preprocessing layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import itertools +import os +import random + +from absl.testing import parameterized +import numpy as np + +from tensorflow.python import keras +from tensorflow.python import tf2 + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers.preprocessing import integer_lookup +from tensorflow.python.keras.layers.preprocessing import integer_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.saving import save +from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.ops.ragged import ragged_factory_ops +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test + + +def get_layer_class(): + if context.executing_eagerly(): + return integer_lookup.IntegerLookup + else: + return integer_lookup_v1.IntegerLookup + + +def _get_end_to_end_test_cases(): + test_cases = ( + { + "testcase_name": + "test_ints_soft_vocab_cap", + # Create an array where 1138 is the most frequent term, followed by + # 1729, then 725, then 42. This ensures that the vocab accumulator + # is sorting by frequency. + "vocab_data": + np.array([[42], [1138], [1138], [1138], [1138], [1729], [1729], + [1729], [725], [725]], + dtype=np.int64), + "input_data": + np.array([[1138], [1729], [725], [42], [42], [725], [1138], [4]], + dtype=np.int64), + "kwargs": { + "max_values": None, + "dtype": dtypes.int64, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": + dtypes.int64 + },) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupLayerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, + use_dataset, expected_output, + input_dtype): + cls = get_layer_class() + expected_output_dtype = dtypes.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # IntegerLookup), the concatenation fails. In real use cases, this may + # not be an issue because users are likely to pipe the preprocessing layer + # into other keras layers instead of predicting it directly. 
A workaround + # for these unit tests is to have the dataset only contain one batch, so + # no concatenation needs to happen with the result. For consistency with + # numpy input, we should make `predict` join differently shaped results + # together sensibly, with 0 padding. + input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch( + input_shape[0]) + vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0]) + + with CustomObjectScope({"IntegerLookup": cls}): + output_data = testing_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data) + self.assertAllClose(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingInputTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]], + dtype=np.int64) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingMultiOOVTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 133], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [6, 2] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()( + max_values=None, + dtype=dtypes.int64, + num_oov_indices=2, + mask_value=0, + oov_value=-1) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input_multi_bucket(self): + vocab_data = np.array([10, 11, 12, 13], 
dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 133]], + dtype=np.int64) + expected_output = [[3, 4, 6], [6, 5, 3, 2]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None, num_oov_indices=2) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class CategoricalEncodingAdaptTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_sparse_adapt(self): + vocab_data = sparse_tensor.SparseTensor( + indices=[[0, 0], [0, 1], [1, 2]], + values=[203, 1729, 203], + dense_shape=[3, 4]) + vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) + + layer = get_layer_class()() + layer.adapt(vocab_dataset) + expected_vocabulary = [0, -1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_ragged_adapt(self): + vocab_data = ragged_factory_ops.constant([[203], [1729, 203]]) + vocab_dataset = dataset_ops.Dataset.from_tensors(vocab_data) + + layer = get_layer_class()() + layer.adapt(vocab_dataset) + expected_vocabulary = [0, -1, 203, 1729] + self.assertAllEqual(expected_vocabulary, layer.get_vocabulary()) + + def test_sparse_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = sparse_tensor.SparseTensor( + indices=[[0, 0], [1, 2]], + values=np.array([13, 32], dtype=np.int64), + dense_shape=[3, 4]) + + expected_indices = [[0, 0], [1, 2]] + expected_values = [5, 1] + expected_dense_shape = [3, 4] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, sparse=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_data = model.predict(input_array, steps=1) + self.assertAllEqual(expected_indices, output_data.indices) + self.assertAllEqual(expected_values, output_data.values) + self.assertAllEqual(expected_dense_shape, output_data.dense_shape) + + def test_ragged_int_input(self): + vocab_data = np.array([10, 11, 12, 13], dtype=np.int64) + input_array = ragged_factory_ops.constant([[10, 11, 13], [13, 12, 10, 42]], + dtype=np.int64) + expected_output = [[2, 3, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64, ragged=True) + layer = get_layer_class()(max_values=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_single_int_generator_dataset(self): + + def word_gen(): + for _ in itertools.count(1): + yield random.randint(0, 100) + + ds = dataset_ops.Dataset.from_generator(word_gen, dtypes.int64, + tensor_shape.TensorShape([])) + batched_ds = ds.take(2) + input_t = keras.Input(shape=(), dtype=dtypes.int64) + layer = get_layer_class()( + max_values=10, num_oov_indices=0, mask_value=None, oov_value=None) + _ = layer(input_t) + layer.adapt(batched_ds) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupOutputTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_int_output(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 
1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()() + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_output_shape(self): + input_data = keras.Input(shape=(4,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, num_oov_indices=1) + int_data = layer(input_data) + self.assertAllEqual(int_data.shape[1:], input_data.shape[1:]) + + def test_int_output_no_reserved_zero(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[1, 2, 3, 4], [4, 3, 1, 0]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, mask_value=None) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()( + vocabulary=vocab_data, + max_values=None, + ) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupVocabularyTest( + keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(str(vocab) + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_get_vocab_returns_int(self): + vocab_data = [42, 1138, 725, 1729] + expected_vocab = [0, -1, 42, 1138, 725, 1729] + layer = get_layer_class()(vocabulary=vocab_data) + layer_vocab = layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], np.int64) + + def test_int_output_explicit_vocab_from_file(self): + vocab_list = [42, 1138, 725, 1729] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = 
model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_unique_vocab_fails(self): + vocab_data = [42, 1138, 725, 1729, 1729] + with self.assertRaisesRegex(ValueError, ".*repeated term.*1729.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = [42, 1138, 725, 1729, 42] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with self.assertRaisesRegex(ValueError, ".*repeated term.*42.*"): + _ = get_layer_class()(vocabulary=vocab_path) + + +@keras_parameterized.run_all_keras_modes(always_skip_eager=True) +class IntegerLookupSaveableTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def test_ops_are_not_added_with_multiple_get_set_weights(self): + vocab_data = [42, 1138, 725, 1729] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + weights = model.get_weights() + model.set_weights(weights) + keras.backend.get_session().graph.finalize() + weights = model.get_weights() + model.set_weights(weights) + + def test_layer_saving_with_h5(self): + vocab_data = [42, 1138, 725, 1729] + + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + path = os.path.join(self.get_temp_dir(), "model") + with self.assertRaisesRegex(NotImplementedError, + "Save or restore weights that is not.*"): + save.save_model(model, path, save_format="h5") + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupErrorTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_too_long_vocab_fails_in_single_setting(self): + vocab_data = [42, 1138, 725, 1729] + + layer = get_layer_class()(max_values=4, num_oov_indices=1) + with self.assertRaisesRegex(ValueError, + "vocabulary larger than the maximum vocab.*"): + layer.set_vocabulary(vocab_data) + + def test_zero_max_values_fails(self): + with self.assertRaisesRegex(ValueError, ".*max_values.*"): + _ = get_layer_class()(max_values=0, num_oov_indices=1) + + +@keras_parameterized.run_all_keras_modes +class IntegerLookupSavingTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_vocabulary_persistence_across_saving(self): + vocab_data = [42, 1138, 725, 1729] + input_array = np.array([[42, 1138, 725, 1729], [1729, 725, 42, 203]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + # Build and validate a golden model. + input_data = keras.Input(shape=(None,), dtype=dtypes.int64) + layer = get_layer_class()(max_values=None, num_oov_indices=1) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(output_dataset, expected_output) + + # Save the model to disk. + output_path = os.path.join(self.get_temp_dir(), "tf_keras_saved_model") + model.save(output_path, save_format="tf") + + # Delete the session and graph to ensure that the loaded model is generated + # from scratch. + # TODO(b/149526183): Can't clear session when TF2 is disabled. 
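
The expected outputs in the IntegerLookup tests above all follow one indexing convention: with the defaults used in this file (mask_value=0, oov_value=-1, num_oov_indices=1), index 0 is reserved for the mask value, index 1 for the OOV value, and vocabulary terms are numbered from 2; passing mask_value=None shifts everything down by one. A minimal sketch of that convention, using a plain dict in place of the layer's lookup table (the helper names are illustrative, not part of the layer API):

# Stand-in for the layer's lookup table, inferred from the expected outputs
# in the tests above: specials first (mask, then OOV), then vocabulary terms.
def build_index(vocab, mask_value=0, oov_value=-1):
  specials = ([mask_value] if mask_value is not None else []) + [oov_value]
  return {v: i for i, v in enumerate(specials + list(vocab))}

def lookup(index, batch, oov_value=-1):
  # Values that are not in the index fall into the OOV bucket.
  return [[index.get(v, index[oov_value]) for v in row] for row in batch]

vocab = [42, 1138, 725, 1729]
batch = [[42, 1138, 725, 1729], [1729, 725, 42, 203]]
assert lookup(build_index(vocab), batch) == [[2, 3, 4, 5], [5, 4, 2, 1]]
assert lookup(build_index(vocab, mask_value=None), batch) == [
    [1, 2, 3, 4], [4, 3, 1, 0]]
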
+ if tf2.enabled(): + keras.backend.clear_session() + + loaded_model = keras.models.load_model( + output_path, custom_objects={"IntegerLookup": get_layer_class()}) + + # Ensure that the loaded model is unique (so that the save/load is real) + self.assertIsNot(model, loaded_model) + + # Validate correctness of the new model. + new_output_dataset = loaded_model.predict(input_array) + self.assertAllEqual(new_output_dataset, expected_output) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py new file mode 100644 index 00000000000..ec326f4d78b --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/integer_lookup_v1.py @@ -0,0 +1,25 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import integer_lookup + + +class IntegerLookup(integer_lookup.IntegerLookup, index_lookup_v1.IndexLookup): + pass diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup.py b/tensorflow/python/keras/layers/preprocessing/string_lookup.py new file mode 100644 index 00000000000..4032486b5f0 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup.py @@ -0,0 +1,106 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.keras.layers.preprocessing import index_lookup +from tensorflow.python.keras.layers.preprocessing import table_utils + + +class StringLookup(index_lookup.IndexLookup): + """Maps strings from a vocabulary to integer indices. + + This layer translates a set of arbitrary strings into an integer output via a + table-based lookup, with optional out-of-vocabulary handling. 
+ + If desired, the user can call this layer's `adapt()` method on a data set, + which will analyze the data set, determine the frequency of individual string + values, and create a vocabulary from them. This vocabulary can have + unlimited size or be capped, depending on the configuration options for this + layer; if there are more unique values in the input than the maximum + vocabulary size, the most frequent terms will be used to create the + vocabulary. + + Attributes: + max_tokens: The maximum size of the vocabulary for this layer. If None, + there is no cap on the size of the vocabulary. Note that this vocabulary + includes the OOV and mask tokens, so the effective number of tokens is + (max_tokens - num_oov_indices - (1 if mask_token else 0)) + num_oov_indices: The number of out-of-vocabulary tokens to use; defaults to + 1. If this value is more than 1, OOV inputs are hashed to determine their + OOV value; if this value is 0, passing an OOV input will result in a '-1' + being returned for that value in the output tensor. (Note that, because + the value is -1 and not 0, this will allow you to effectively drop OOV + values from categorical encodings.) + mask_token: A token that represents masked values, and which is mapped to + index 0. Defaults to the empty string "". If set to None, no mask term + will be added and the OOV tokens, if any, will be indexed from + (0...num_oov_indices) instead of (1...num_oov_indices+1). + oov_token: The token representing an out-of-vocabulary value. Defaults to + "[OOV]". + vocabulary: An optional list of vocabulary terms, or a path to a text file + containing a vocabulary to load into this layer. The file should contain + one token per line. If the list or file contains the same token multiple + times, an error will be thrown. + encoding: The Python string encoding to use. Defaults to `'utf-8'`. + """ + + def __init__(self, + max_tokens=None, + num_oov_indices=1, + mask_token="", + oov_token="[OOV]", + vocabulary=None, + encoding="utf-8", + **kwargs): + allowed_dtypes = [dtypes.string] + + if "dtype" in kwargs and kwargs["dtype"] not in allowed_dtypes: + raise ValueError("StringLookup may only have a dtype in %s." % + allowed_dtypes) + + if "dtype" not in kwargs: + kwargs["dtype"] = dtypes.string + + if vocabulary is not None: + if isinstance(vocabulary, str): + vocabulary = table_utils.get_vocabulary_from_file(vocabulary, encoding) + + self.encoding = encoding + + super(StringLookup, self).__init__( + max_tokens=max_tokens, + num_oov_indices=num_oov_indices, + mask_token=mask_token, + oov_token=oov_token, + vocabulary=vocabulary, + **kwargs) + + def get_config(self): + config = {"encoding": self.encoding} + base_config = super(StringLookup, self).get_config() + return dict(list(base_config.items()) + list(config.items())) + + def get_vocabulary(self): + if self._table_handler.vocab_size() == 0: + return [] + + keys, values = self._table_handler.data() + # This is required because the MutableHashTable doesn't preserve insertion + # order, but we rely on the order of the array to assign indices. + return [x.decode(self.encoding) for _, x in sorted(zip(values, keys))] diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py new file mode 100644 index 00000000000..b2a610ac328 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_test.py @@ -0,0 +1,224 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
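
The StringLookup layer above is configured through max_tokens, num_oov_indices, a mask_token that defaults to the empty string and takes index 0, an oov_token ("[OOV]") that takes index 1, and an optional fixed vocabulary supplied as a list or file path. A minimal usage sketch, assuming eager execution and the module path added by this patch; the values in the output comments follow the expectations exercised by the string_lookup tests that follow:

import numpy as np
from tensorflow.python.keras.layers.preprocessing import string_lookup

# Explicit vocabulary: index 0 is the mask token "", index 1 is "[OOV]",
# and the supplied terms occupy indices 2..5.
layer = string_lookup.StringLookup(vocabulary=["earth", "wind", "and", "fire"])
print(layer.get_vocabulary())  # ['', '[OOV]', 'earth', 'wind', 'and', 'fire']

data = np.array([["earth", "wind", "and", "fire"],
                 ["fire", "and", "earth", "michigan"]])
# "michigan" is not in the vocabulary, so it maps to the OOV index 1.
print(layer(data))  # values: [[2, 3, 4, 5], [5, 4, 2, 1]]
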
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras text vectorization preprocessing layer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from absl.testing import parameterized +import numpy as np +import six + +from tensorflow.python import keras + +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.eager import context +from tensorflow.python.framework import dtypes +from tensorflow.python.keras import keras_parameterized +from tensorflow.python.keras import testing_utils +from tensorflow.python.keras.layers.preprocessing import preprocessing_test_utils +from tensorflow.python.keras.layers.preprocessing import string_lookup +from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 +from tensorflow.python.keras.saving import save +from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test + + +def get_layer_class(): + if context.executing_eagerly(): + return string_lookup.StringLookup + else: + return string_lookup_v1.StringLookup + + +def _get_end_to_end_test_cases(): + test_cases = ( + { + "testcase_name": + "test_strings_soft_vocab_cap", + # Create an array where 'earth' is the most frequent term, followed by + # 'wind', then 'and', then 'fire'. This ensures that the vocab + # accumulator is sorting by frequency. + "vocab_data": + np.array([["fire"], ["earth"], ["earth"], ["earth"], ["earth"], + ["wind"], ["wind"], ["wind"], ["and"], ["and"]]), + "input_data": + np.array([["earth"], ["wind"], ["and"], ["fire"], ["fire"], + ["and"], ["earth"], ["michigan"]]), + "kwargs": { + "max_tokens": None, + }, + "expected_output": [[2], [3], [4], [5], [5], [4], [2], [1]], + "input_dtype": + dtypes.string + }, + ) + + crossed_test_cases = [] + # Cross above test cases with use_dataset in (True, False) + for use_dataset in (True, False): + for case in test_cases: + case = case.copy() + if use_dataset: + case["testcase_name"] = case["testcase_name"] + "_with_dataset" + case["use_dataset"] = use_dataset + crossed_test_cases.append(case) + + return crossed_test_cases + + +@keras_parameterized.run_all_keras_modes +class StringLookupLayerTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + @parameterized.named_parameters(*_get_end_to_end_test_cases()) + def test_layer_end_to_end_with_adapt(self, vocab_data, input_data, kwargs, + use_dataset, expected_output, + input_dtype): + cls = get_layer_class() + expected_output_dtype = dtypes.int64 + input_shape = input_data.shape + + if use_dataset: + # Keras APIs expect batched datasets. + # TODO(rachelim): `model.predict` predicts the result on each + # dataset batch separately, then tries to concatenate the results + # together. 
When the results have different shapes on the non-concat + # axis (which can happen in the output_mode = INT case for + # StringLookup), the concatenation fails. In real use cases, this may + # not be an issue because users are likely to pipe the preprocessing layer + # into other keras layers instead of predicting it directly. A workaround + # for these unit tests is to have the dataset only contain one batch, so + # no concatenation needs to happen with the result. For consistency with + # numpy input, we should make `predict` join differently shaped results + # together sensibly, with 0 padding. + input_data = dataset_ops.Dataset.from_tensor_slices(input_data).batch( + input_shape[0]) + vocab_data = dataset_ops.Dataset.from_tensor_slices(vocab_data).batch( + input_shape[0]) + + with CustomObjectScope({"StringLookup": cls}): + output_data = testing_utils.layer_test( + cls, + kwargs=kwargs, + input_shape=input_shape, + input_data=input_data, + input_dtype=input_dtype, + expected_output_dtype=expected_output_dtype, + validate_training=False, + adapt_data=vocab_data) + self.assertAllClose(expected_output, output_data) + + +@keras_parameterized.run_all_keras_modes +class StringLookupVocabularyTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest + ): + + def _write_to_temp_file(self, file_name, vocab_list): + vocab_path = os.path.join(self.get_temp_dir(), file_name + ".txt") + with gfile.GFile(vocab_path, "w") as writer: + for vocab in vocab_list: + writer.write(vocab + "\n") + writer.flush() + writer.close() + return vocab_path + + def test_int_output_explicit_vocab(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_get_vocab_returns_str(self): + vocab_data = ["earth", "wind", "and", "fire"] + expected_vocab = ["", "[OOV]", "earth", "wind", "and", "fire"] + layer = get_layer_class()(vocabulary=vocab_data) + layer_vocab = layer.get_vocabulary() + self.assertAllEqual(expected_vocab, layer_vocab) + self.assertIsInstance(layer_vocab[0], six.text_type) + + def test_int_output_explicit_vocab_from_file(self): + vocab_list = ["earth", "wind", "and", "fire"] + vocab_path = self._write_to_temp_file("vocab_file", vocab_list) + + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(vocabulary=vocab_path) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + + def test_non_unique_vocab_fails(self): + vocab_data = ["earth", "wind", "and", "fire", "fire"] + with self.assertRaisesRegex(ValueError, ".*repeated term.*fire.*"): + _ = get_layer_class()(vocabulary=vocab_data) + + def test_non_unique_vocab_from_file_fails(self): + vocab_list = ["earth", "wind", "and", "fire", "earth"] + vocab_path = self._write_to_temp_file("repeat_vocab_file", vocab_list) + with 
self.assertRaisesRegex(ValueError, ".*repeated term.*earth.*"): + _ = get_layer_class()(vocabulary=vocab_path) + + +@keras_parameterized.run_all_keras_modes(always_skip_eager=True) +class StringLookupSaveableTest(keras_parameterized.TestCase, + preprocessing_test_utils.PreprocessingLayerTest): + + def test_ops_are_not_added_with_multiple_get_set_weights(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(max_tokens=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + weights = model.get_weights() + model.set_weights(weights) + keras.backend.get_session().graph.finalize() + weights = model.get_weights() + model.set_weights(weights) + + def test_layer_saving_with_h5(self): + vocab_data = ["earth", "wind", "and", "fire"] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()(max_tokens=10) + layer.set_vocabulary(vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + path = os.path.join(self.get_temp_dir(), "model") + with self.assertRaisesRegex(NotImplementedError, + "Save or restore weights that is not.*"): + save.save_model(model, path, save_format="h5") + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py new file mode 100644 index 00000000000..0d4c70de655 --- /dev/null +++ b/tensorflow/python/keras/layers/preprocessing/string_lookup_v1.py @@ -0,0 +1,25 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
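
The vocabulary argument also accepts a path to a plain-text file with one token per line, which is exactly what the _write_to_temp_file helper in the tests above produces. A small sketch of that variant, again assuming eager execution; the file path here is illustrative:

from tensorflow.python.keras.layers.preprocessing import string_lookup

# Vocabulary file with one token per line, as described in the docstring.
vocab_path = "/tmp/vocab_file.txt"  # illustrative path
with open(vocab_path, "w") as f:
  f.write("earth\nwind\nand\nfire\n")

layer = string_lookup.StringLookup(vocabulary=vocab_path)
print(layer.get_vocabulary())  # ['', '[OOV]', 'earth', 'wind', 'and', 'fire']
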
+# ============================================================================== +"""Keras string lookup preprocessing layer.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import string_lookup + + +class StringLookup(string_lookup.StringLookup, index_lookup_v1.IndexLookup): + pass diff --git a/tensorflow/python/keras/layers/preprocessing/table_utils.py b/tensorflow/python/keras/layers/preprocessing/table_utils.py index f5397da1f3e..05447f6e9ff 100644 --- a/tensorflow/python/keras/layers/preprocessing/table_utils.py +++ b/tensorflow/python/keras/layers/preprocessing/table_utils.py @@ -189,4 +189,3 @@ def convert_to_ndarray(x, dtype=None): if np.can_cast(array.dtype, np_dtype): array = array.astype(np_dtype, casting="safe") return array - diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 119e0b5ccff..4156ba50c02 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -32,7 +32,7 @@ from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine.base_preprocessing_layer import Combiner from tensorflow.python.keras.engine.base_preprocessing_layer import CombinerPreprocessingLayer from tensorflow.python.keras.layers.preprocessing import categorical_encoding -from tensorflow.python.keras.layers.preprocessing import index_lookup +from tensorflow.python.keras.layers.preprocessing import string_lookup from tensorflow.python.keras.utils import layer_utils from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops @@ -269,10 +269,6 @@ class TextVectorization(CombinerPreprocessingLayer): self._max_tokens = max_tokens - # In INT mode, we have two reserved values (PAD and OOV). However, non-INT - # modes don't have a PAD value, so we only need to reserve one value. - self._reserved_values = 2 if output_mode == INT else 1 - # In INT mode, the zero value is reserved for padding (per Keras standard # padding approaches). In non-INT modes, there is no padding so we can set # the OOV value to zero instead of one. @@ -303,9 +299,9 @@ class TextVectorization(CombinerPreprocessingLayer): self._max_vocab_size, compute_idf=output_mode == TFIDF), **kwargs) - reserve_zero = output_mode in [None, INT] + mask_token = "" if output_mode in [None, INT] else None self._index_lookup_layer = self._get_index_lookup_class()( - max_tokens=max_tokens, reserve_zero=reserve_zero, dtype=dtypes.string) + max_tokens=max_tokens, mask_token=mask_token) # If this layer is configured for string or integer output, we do not # create a vectorization layer (as the output is not vectorized). @@ -328,7 +324,7 @@ class TextVectorization(CombinerPreprocessingLayer): return (keys.numpy(), values.numpy()) def _get_index_lookup_class(self): - return index_lookup.IndexLookup + return string_lookup.StringLookup def _to_numpy(self, preprocessed_data): """Converts preprocessed inputs into numpy arrays.""" @@ -428,26 +424,21 @@ class TextVectorization(CombinerPreprocessingLayer): def set_vocabulary(self, vocab, df_data=None, - oov_df_value=None, - append=False): + oov_df_value=None): """Sets vocabulary (and optionally document frequency) data for this layer. 
This method sets the vocabulary and DF data for this layer directly, instead of analyzing a dataset through 'adapt'. It should be used whenever the vocab (and optionally document frequency) information is already known. If - vocabulary data is already present in the layer, this method will either - replace it, if 'append' is set to False, or append to it (if 'append' is set - to True). + vocabulary data is already present in the layer, this method will replace + it. Arguments: vocab: An array of string tokens. df_data: An array of document frequency data. Only necessary if the layer output_mode is TFIDF. oov_df_value: The document frequency of the OOV token. Only necessary if - output_mode is TFIDF. OOV data is optional when appending additional - data in TFIDF mode; if an OOV value is supplied it will overwrite the - existing OOV value. - append: Whether to overwrite or append any existing vocabulary data. + output_mode is TFIDF. Raises: ValueError: If there are too many inputs, the inputs do not match, or @@ -468,8 +459,7 @@ class TextVectorization(CombinerPreprocessingLayer): "be changed after the layer is " "called.").format(mode=self._output_mode)) - current_table_size = self._index_lookup_layer.vocab_size() - self._index_lookup_layer.set_vocabulary(vocab, append) + self._index_lookup_layer.set_vocabulary(vocab) # When doing raw or integer output, we don't have a Vectorize layer to # manage. In this case, we can return directly. @@ -477,14 +467,9 @@ class TextVectorization(CombinerPreprocessingLayer): return if not self._pad_to_max or self._max_tokens is None: - num_tokens = self._index_lookup_layer.vocab_size() + self._reserved_values + num_tokens = self._index_lookup_layer.vocab_size() self._vectorize_layer.set_num_elements(num_tokens) - # We're only _really_ appending if the table_size is nonzero. This is - # important for some sanity checks in tfidf mode (specifically, checking if - # oov_df_value is set or not) and handling existing tfidf weight data. - append = append if current_table_size > 0 else False - if self._output_mode == TFIDF: if df_data is None: raise ValueError("df_data must be set if output_mode is TFIDF") @@ -492,31 +477,14 @@ class TextVectorization(CombinerPreprocessingLayer): raise ValueError("df_data must be the same length as vocab. " "len(df_data) is %s, len(vocab) is %s" % (len(vocab), len(df_data))) - if not append and oov_df_value is None: - raise ValueError("You must pass an oov_df_value the first time " - "'set_vocabulary' is called when output_mode is " + if oov_df_value is None: + raise ValueError("You must pass an oov_df_value when output_mode is " "TFIDF.") df_data = self._convert_to_ndarray(df_data) - if append: - # The existing IDF data is stored in a Keras weight, so we can get it - # by calling K.get_value() on the weight object. Take the first - # table_size+1 values in case we're padding the weight with zeros - existing_df_data = K.get_value( - self._vectorize_layer.tf_idf_weights)[:current_table_size + 1] - df_data = np.append(existing_df_data, df_data, axis=0) - # If we are appending and need to replace the OOV DF value, we can - # assign it over the existing OOV DF value at index 0 of the (already- - # concatenated) DF value array. - if oov_df_value is not None: - df_data[0] = oov_df_value - else: - # If we are not appending (that is, we have only new data) we need to - # insert the OOV value to the front of the array. (This is a append to - # the head, not a replacement of the zeroth value.) 
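
With the append path removed, the TFIDF branch in the hunk below reduces to prepending the OOV document frequency to the user-supplied df_data, so that position 0 of the weight vector lines up with the OOV index (non-INT modes reserve no padding slot, as noted earlier in this file). A standalone sketch of that transformation with illustrative values:

import numpy as np

# Mirrors the simplified TFIDF handling: the OOV document frequency is
# inserted at the head of df_data before it is handed to the vectorize layer.
df_data = np.array([0.8, 0.6, 0.4, 0.2])  # one DF value per vocabulary term
oov_df_value = 0.05

if not isinstance(oov_df_value, np.ndarray):
  oov_df_value = np.array([oov_df_value])
df_data = np.insert(df_data, 0, oov_df_value)
print(df_data)  # [0.05 0.8 0.6 0.4 0.2] -- index 0 now carries the OOV weight
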
- if not isinstance(oov_df_value, np.ndarray): - oov_df_value = np.array([oov_df_value]) - df_data = np.insert(df_data, 0, oov_df_value) + if not isinstance(oov_df_value, np.ndarray): + oov_df_value = np.array([oov_df_value]) + df_data = np.insert(df_data, 0, oov_df_value) self._vectorize_layer.set_tfidf_data(df_data) def build(self, input_shape): @@ -536,8 +504,10 @@ class TextVectorization(CombinerPreprocessingLayer): if not self.built: raise RuntimeError("_set_state_variables() must be called after build().") if self._output_mode == TFIDF: - self.set_vocabulary(updates[_VOCAB_NAME], updates[_IDF_NAME], - updates[_OOV_IDF_NAME]) + self.set_vocabulary( + updates[_VOCAB_NAME], + updates[_IDF_NAME], + updates[_OOV_IDF_NAME]) else: self.set_vocabulary(updates[_VOCAB_NAME]) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index d8325f39149..f8a1f5b9434 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -619,25 +619,6 @@ class TextVectorizationOutputTest( output_dataset = model.predict(input_array) self.assertAllEqual(expected_output, output_dataset) - def test_vocab_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - input_array = np.array([["earth", "wind", "and", "fire"], - ["fire", "and", "earth", "michigan"]]) - expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - layer.set_vocabulary(vocab_data[0]) - layer.set_vocabulary(vocab_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - def test_int_output_densifies_with_zeros(self): vocab_data = ["earth", "wind", "and", "fire"] # Create an input array that has 5 elements in the first example and 4 in @@ -1046,7 +1027,10 @@ class TextVectorizationOutputTest( split=None, output_mode=text_vectorization.TFIDF, pad_to_max_tokens=True) - layer.set_vocabulary(vocab_data, df_data=tfidf_data, oov_df_value=.05) + layer.set_vocabulary( + vocab_data, + df_data=tfidf_data, + oov_df_value=.05) int_data = layer(input_data) self.assertAllEqual(expected_output_shape, int_data.shape.as_list()) @@ -1084,60 +1068,6 @@ class TextVectorizationOutputTest( output_dataset = model.predict(input_array) self.assertAllClose(expected_output, output_dataset) - def test_tfidf_appending(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - tfidf_data = [[.5, .25], [.2, .125]] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, 1, .25, .2, 0], - [.1, .5, 0, 0, .125]] - # pylint: enable=bad-whitespace - # pyformat: enable - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05) - layer.set_vocabulary(vocab_data[1], df_data=tfidf_data[1], append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = 
model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - - def test_tfidf_appending_with_oov_replacement(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - tfidf_data = [[.5, .25], [.2, .125]] - input_array = np.array([["earth", "wind", "and", "earth"], - ["ohio", "fire", "earth", "michigan"]]) - - # pyformat: disable - # pylint: disable=bad-whitespace - expected_output = [[ 0, 1, .25, .2, 0], - [1.5, .5, 0, 0, .125]] - # pylint: enable=bad-whitespace - # pyformat: enable - - input_data = keras.Input(shape=(None,), dtype=dtypes.string) - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - layer.set_vocabulary(vocab_data[0], df_data=tfidf_data[0], oov_df_value=.05) - # Note that here we've replaced the OOV vaue. - layer.set_vocabulary( - vocab_data[1], df_data=tfidf_data[1], oov_df_value=.75, append=True) - int_data = layer(input_data) - model = keras.Model(inputs=input_data, outputs=int_data) - output_dataset = model.predict(input_array) - self.assertAllClose(expected_output, output_dataset) - def test_accept_1D_input(self): input_array = np.array(["earth wind and fire", "fire and earth michigan"]) @@ -1274,22 +1204,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase, "vocabulary larger than the maximum vocab.*"): layer.set_vocabulary(vocab_data) - def test_too_long_vocab_fails_in_multiple_settings(self): - vocab_data = [["earth", "wind"], ["and", "fire"]] - - layer = get_layer_class()( - max_tokens=4, - standardize=None, - split=None, - output_mode=text_vectorization.INT) - - # The first time we call set_vocabulary, we're under the max_tokens limit - # so it should be fine. - layer.set_vocabulary(vocab_data[0]) - with self.assertRaisesRegex(ValueError, - "vocabulary larger than the maximum vocab.*"): - layer.set_vocabulary(vocab_data[1], append=True) - def test_setting_vocab_without_tfidf_data_fails_in_tfidf_mode(self): vocab_data = ["earth", "wind", "and", "fire"] @@ -1326,18 +1240,6 @@ class TextVectorizationErrorTest(keras_parameterized.TestCase, "You must pass an oov_df_value.*"): layer.set_vocabulary(vocab_data, df_data) - def test_tfidf_set_vocab_with_no_oov_fails_with_append_set(self): - vocab_data = ["earth", "wind", "and", "fire"] - df_data = [1, 2, 3, 4] - layer = get_layer_class()( - max_tokens=5, - standardize=None, - split=None, - output_mode=text_vectorization.TFIDF) - with self.assertRaisesRegex(ValueError, - "You must pass an oov_df_value.*"): - layer.set_vocabulary(vocab_data, df_data, append=True) - def test_set_tfidf_in_non_tfidf_fails(self): vocab_data = ["earth", "wind", "and", "fire"] df_data = [1, 2, 3, 4] diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py index b869bee52ab..59cf2c61288 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_v1.py @@ -23,7 +23,7 @@ import numpy as np from tensorflow.python.keras import backend as K from tensorflow.python.keras.engine import base_preprocessing_layer_v1 from tensorflow.python.keras.layers.preprocessing import categorical_encoding_v1 -from tensorflow.python.keras.layers.preprocessing import index_lookup_v1 +from tensorflow.python.keras.layers.preprocessing import string_lookup_v1 from tensorflow.python.keras.layers.preprocessing import text_vectorization from tensorflow.python.ops.ragged import 
ragged_tensor_value from tensorflow.python.util.tf_export import keras_export @@ -84,7 +84,7 @@ class TextVectorization(text_vectorization.TextVectorization, return categorical_encoding_v1.CategoricalEncoding def _get_index_lookup_class(self): - return index_lookup_v1.IndexLookup + return string_lookup_v1.StringLookup def _to_numpy(self, data): """Converts preprocessed inputs into numpy arrays.""" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 47852865558..4f5b0f480e4 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -221,7 +221,7 @@ tf_class { } member_method { name: "set_vocabulary" - argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "set_weights" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 05154268354..a33f65189fd 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -219,7 +219,7 @@ tf_class { } member_method { name: "set_vocabulary" - argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\', \'append\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], " + argspec: "args=[\'self\', \'vocab\', \'df_data\', \'oov_df_value\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "set_weights" From 37df93331ee0ee9f00830ee223a79198edca8f89 Mon Sep 17 00:00:00 2001 From: David Rim Date: Thu, 14 May 2020 19:19:49 -0700 Subject: [PATCH 235/412] Adds utility methods for storing SignatureDefs in the metadata table in the flatbuffer PiperOrigin-RevId: 311652937 Change-Id: I397c7ce6fad843cff789dedb583d6df44545db3f --- tensorflow/lite/tools/signature/BUILD | 106 +++++++++++ .../tools/signature/signature_def_util.cc | 175 ++++++++++++++++++ .../lite/tools/signature/signature_def_util.h | 71 +++++++ .../signature/signature_def_util_test.cc | 167 +++++++++++++++++ .../signature_def_util_wrapper_pybind11.cc | 95 ++++++++++ .../tools/signature/signature_def_utils.py | 95 ++++++++++ .../signature/signature_def_utils_test.py | 76 ++++++++ 7 files changed, 785 insertions(+) create mode 100644 tensorflow/lite/tools/signature/BUILD create mode 100644 tensorflow/lite/tools/signature/signature_def_util.cc create mode 100644 tensorflow/lite/tools/signature/signature_def_util.h create mode 100644 tensorflow/lite/tools/signature/signature_def_util_test.cc create mode 100644 tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc create mode 100644 tensorflow/lite/tools/signature/signature_def_utils.py create mode 100644 tensorflow/lite/tools/signature/signature_def_utils_test.py diff --git a/tensorflow/lite/tools/signature/BUILD 
b/tensorflow/lite/tools/signature/BUILD new file mode 100644 index 00000000000..cf28b2eab72 --- /dev/null +++ b/tensorflow/lite/tools/signature/BUILD @@ -0,0 +1,106 @@ +# Utilities for signature_defs in TFLite +load("//tensorflow:tensorflow.bzl", "pybind_extension") +load("//tensorflow:tensorflow.bzl", "if_not_windows") +load("//tensorflow/lite:build_def.bzl", "tflite_copts") +load("//tensorflow/lite/micro:build_def.bzl", "cc_library") +load("//tensorflow/lite:special_rules.bzl", "tflite_portable_test_suite") + +package( + default_visibility = [ + "//visibility:public", + ], + licenses = ["notice"], # Apache 2.0 +) + +TFLITE_DEFAULT_COPTS = if_not_windows([ + "-Wall", + "-Wno-comment", + "-Wno-extern-c-compat", +]) + +cc_library( + name = "signature_def_util", + srcs = ["signature_def_util.cc"], + hdrs = ["signature_def_util.h"], + copts = TFLITE_DEFAULT_COPTS + tflite_copts(), + deps = [ + "//tensorflow/core:lib_proto_parsing", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:protos_all_cc_impl", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "//tensorflow/lite:framework", + "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/memory", + "@com_google_protobuf//:protobuf", + "@flatbuffers", + ], +) + +cc_test( + name = "signature_def_util_test", + size = "small", + srcs = ["signature_def_util_test.cc"], + data = [ + "//tensorflow/lite:testdata/add.bin", + ], + tags = [ + "tflite_not_portable", + ], + deps = [ + ":signature_def_util", + "//tensorflow/cc/saved_model:signature_constants", + "//tensorflow/core:tflite_portable_logging", + "//tensorflow/core/platform:errors", + "//tensorflow/lite:framework_lib", + "//tensorflow/lite/c:c_api", + "//tensorflow/lite/c:common", + "//tensorflow/lite/schema:schema_fbs", + "//tensorflow/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + +pybind_extension( + name = "_pywrap_signature_def_util_wrapper", + srcs = [ + "signature_def_util_wrapper_pybind11.cc", + ], + module_name = "_pywrap_signature_def_util_wrapper", + deps = [ + ":signature_def_util", + "//tensorflow/lite:framework_lib", + "//tensorflow/python:pybind11_lib", + "@pybind11", + ], +) + +py_library( + name = "signature_def_utils", + srcs = ["signature_def_utils.py"], + srcs_version = "PY2AND3", + deps = [ + ":_pywrap_signature_def_util_wrapper", + "//tensorflow/core:protos_all_py", + ], +) + +py_test( + name = "signature_def_utils_test", + srcs = ["signature_def_utils_test.py"], + data = ["//tensorflow/lite:testdata/add.bin"], + python_version = "PY3", + srcs_version = "PY2AND3", + tags = [ + "no_mac", + ], + visibility = ["//visibility:public"], + deps = [ + ":signature_def_utils", + "//tensorflow:tensorflow_py", + "//tensorflow/core:protos_all_py", + ], +) + +tflite_portable_test_suite() diff --git a/tensorflow/lite/tools/signature/signature_def_util.cc b/tensorflow/lite/tools/signature/signature_def_util.cc new file mode 100644 index 00000000000..e44fe98b3cc --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util.cc @@ -0,0 +1,175 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/tools/signature/signature_def_util.h" + +#include + +#include "absl/memory/memory.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "flatbuffers/flexbuffers.h" // from @flatbuffers +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/lite/model_builder.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { +namespace { + +using tensorflow::Status; +using SerializedSignatureDefMap = std::map; +using SignatureDefMap = std::map; + +const Metadata* GetSignatureDefMetadata(const Model* model) { + if (!model || !model->metadata()) { + return nullptr; + } + for (int i = 0; i < model->metadata()->size(); ++i) { + const Metadata* metadata = model->metadata()->Get(i); + if (metadata->name()->str() == kSignatureDefsMetadataName) { + return metadata; + } + } + return nullptr; +} + +Status ReadSignatureDefMap(const Model* model, const Metadata* metadata, + SerializedSignatureDefMap* map) { + if (!model || !metadata || !map) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + const flatbuffers::Vector* flatbuffer_data = + model->buffers()->Get(metadata->buffer())->data(); + const auto signature_defs = + flexbuffers::GetRoot(flatbuffer_data->data(), flatbuffer_data->size()) + .AsMap(); + for (int i = 0; i < signature_defs.Keys().size(); ++i) { + const std::string key = signature_defs.Keys()[i].AsString().c_str(); + (*map)[key] = signature_defs[key].AsString().c_str(); + } + return tensorflow::Status::OK(); +} + +} // namespace + +Status SetSignatureDefMap(const Model* model, + const SignatureDefMap& signature_def_map, + std::string* model_data_with_signature_def) { + if (!model || !model_data_with_signature_def) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + if (signature_def_map.empty()) { + return tensorflow::errors::InvalidArgument( + "signature_def_map should not be empty"); + } + flexbuffers::Builder fbb; + const size_t start_map = fbb.StartMap(); + auto mutable_model = absl::make_unique(); + model->UnPackTo(mutable_model.get(), nullptr); + int buffer_id = mutable_model->buffers.size(); + const Metadata* metadata = GetSignatureDefMetadata(model); + if (metadata) { + buffer_id = metadata->buffer(); + } else { + auto buffer = absl::make_unique(); + mutable_model->buffers.emplace_back(std::move(buffer)); + auto sigdef_metadata = absl::make_unique(); + sigdef_metadata->buffer = buffer_id; + sigdef_metadata->name = kSignatureDefsMetadataName; + mutable_model->metadata.emplace_back(std::move(sigdef_metadata)); + } + for (const auto& entry : signature_def_map) { + fbb.String(entry.first.c_str(), entry.second.SerializeAsString()); + } + fbb.EndMap(start_map); + fbb.Finish(); + mutable_model->buffers[buffer_id]->data = fbb.GetBuffer(); + flatbuffers::FlatBufferBuilder builder; + auto packed_model = Model::Pack(builder, mutable_model.get()); + FinishModelBuffer(builder, 
packed_model); + *model_data_with_signature_def = + std::string(reinterpret_cast(builder.GetBufferPointer()), + builder.GetSize()); + return Status::OK(); +} + +bool HasSignatureDef(const Model* model, const std::string& signature_key) { + if (!model) { + return false; + } + const Metadata* metadata = GetSignatureDefMetadata(model); + if (!metadata) { + return false; + } + SerializedSignatureDefMap signature_defs; + if (ReadSignatureDefMap(model, metadata, &signature_defs) != + tensorflow::Status::OK()) { + return false; + } + return (signature_defs.find(signature_key) != signature_defs.end()); +} + +Status GetSignatureDefMap(const Model* model, + SignatureDefMap* signature_def_map) { + if (!model || !signature_def_map) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + SignatureDefMap retrieved_signature_def_map; + const Metadata* metadata = GetSignatureDefMetadata(model); + if (metadata) { + SerializedSignatureDefMap signature_defs; + auto status = ReadSignatureDefMap(model, metadata, &signature_defs); + if (status != tensorflow::Status::OK()) { + return tensorflow::errors::Internal("Error reading signature def map: %s", + status.error_message()); + } + for (const auto& entry : signature_defs) { + tensorflow::SignatureDef signature_def; + if (!signature_def.ParseFromString(entry.second)) { + return tensorflow::errors::Internal( + "Cannot parse signature def found in flatbuffer."); + } + retrieved_signature_def_map[entry.first] = signature_def; + } + *signature_def_map = retrieved_signature_def_map; + } + return Status::OK(); +} + +Status ClearSignatureDefMap(const Model* model, std::string* model_data) { + if (!model || !model_data) { + return tensorflow::errors::InvalidArgument("Arguments must not be nullptr"); + } + auto mutable_model = absl::make_unique(); + model->UnPackTo(mutable_model.get(), nullptr); + for (int id = 0; id < model->metadata()->size(); ++id) { + const Metadata* metadata = model->metadata()->Get(id); + if (metadata->name()->str() == kSignatureDefsMetadataName) { + auto* buffers = &(mutable_model->buffers); + buffers->erase(buffers->begin() + metadata->buffer()); + mutable_model->metadata.erase(mutable_model->metadata.begin() + id); + break; + } + } + flatbuffers::FlatBufferBuilder builder; + auto packed_model = Model::Pack(builder, mutable_model.get()); + FinishModelBuffer(builder, packed_model); + *model_data = + std::string(reinterpret_cast(builder.GetBufferPointer()), + builder.GetSize()); + return Status::OK(); +} + +} // namespace tflite diff --git a/tensorflow/lite/tools/signature/signature_def_util.h b/tensorflow/lite/tools/signature/signature_def_util.h new file mode 100644 index 00000000000..7e9c96ffc43 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util.h @@ -0,0 +1,71 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_LITE_TOOLS_SIGNATURE_DEF_UTIL_H_ +#define TENSORFLOW_LITE_TOOLS_SIGNATURE_DEF_UTIL_H_ + +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/schema/schema_generated.h" + +namespace tflite { + +// Constant for name of the Metadata entry associated with SignatureDefs. +constexpr char kSignatureDefsMetadataName[] = "signature_defs_metadata"; + +// The function `SetSignatureDefMap()` results in +// `model_data_with_signature_defs` containing a serialized TFLite model +// identical to `model` with a metadata and associated buffer containing +// a FlexBuffer::Map with `signature_def_map` keys and values serialized to +// String. +// +// If a Metadata entry containing a SignatureDef map exists, it will be +// overwritten. +// +// Returns error if `model_data_with_signature_defs` is null or +// `signature_def_map` is empty. +// +// On success, returns tensorflow::Status::OK() or error otherwise. +// On error, `model_data_with_signature_defs` is unchanged. +tensorflow::Status SetSignatureDefMap( + const Model* model, + const std::map& signature_def_map, + std::string* model_data_with_signature_defs); + +// The function `HasSignatureDef()` returns true if `model` contains a Metadata +// table pointing to a buffer containing a FlexBuffer::Map and the map has +// `signature_key` as a key, or false otherwise. +bool HasSignatureDef(const Model* model, const std::string& signature_key); + +// The function `GetSignatureDefMap()` results in `signature_def_map` +// pointing to a map +// parsed from `model`'s metadata buffer. +// +// If the Metadata entry does not exist, `signature_def_map` is unchanged. +// If the Metadata entry exists but cannot be parsed, returns an error. +tensorflow::Status GetSignatureDefMap( + const Model* model, + std::map* signature_def_map); + +// The function `ClearSignatureDefs` results in `model_data` +// containing a serialized Model identical to `model` omitting any +// SignatureDef-related metadata or buffers. +tensorflow::Status ClearSignatureDefMap(const Model* model, + std::string* model_data); + +} // namespace tflite + +#endif // TENSORFLOW_LITE_TOOLS_SIGNATURE_DEF_UTIL_H_ diff --git a/tensorflow/lite/tools/signature/signature_def_util_test.cc b/tensorflow/lite/tools/signature/signature_def_util_test.cc new file mode 100644 index 00000000000..d4581e262a4 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util_test.cc @@ -0,0 +1,167 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
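
The header above keeps each SignatureDef serialized to a string inside a FlexBuffer map, stored in a model buffer that the "signature_defs_metadata" metadata entry points to. For reference, this is the kind of proto those utilities serialize, sketched in Python with the standard SignatureDef message; the signature key and tensor names mirror the constants used in the C++ test that follows, and the Python wrapper added elsewhere in this change is not shown here:

from tensorflow.core.protobuf import meta_graph_pb2

# A SignatureDef equivalent to GetTestSignatureDef() in the test below:
# one named input, one named output, and the classify method name.
signature_def = meta_graph_pb2.SignatureDef()
signature_def.method_name = "tensorflow/serving/classify"
signature_def.inputs["input"].name = "input"
signature_def.outputs["output"].name = "output"

# SetSignatureDefMap stores each map entry as key -> serialized proto string;
# "serving_default" matches the default serving signature key in the test.
serialized_map = {"serving_default": signature_def.SerializeToString()}
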
+==============================================================================*/ +#include "tensorflow/lite/tools/signature/signature_def_util.h" + +#include +#include "tensorflow/cc/saved_model/signature_constants.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/lite/c/c_api.h" +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/model_builder.h" +#include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/testing/util.h" + +namespace tflite { +namespace { + +using tensorflow::kClassifyMethodName; +using tensorflow::kDefaultServingSignatureDefKey; +using tensorflow::kPredictMethodName; +using tensorflow::SignatureDef; +using tensorflow::Status; + +constexpr char kSignatureInput[] = "input"; +constexpr char kSignatureOutput[] = "output"; +constexpr char kTestFilePath[] = "tensorflow/lite/testdata/add.bin"; + +class SimpleSignatureDefUtilTest : public testing::Test { + protected: + void SetUp() override { + flatbuffer_model_ = FlatBufferModel::BuildFromFile(kTestFilePath); + ASSERT_NE(flatbuffer_model_, nullptr); + model_ = flatbuffer_model_->GetModel(); + ASSERT_NE(model_, nullptr); + } + + SignatureDef GetTestSignatureDef() { + auto signature_def = SignatureDef(); + tensorflow::TensorInfo input_tensor; + tensorflow::TensorInfo output_tensor; + *input_tensor.mutable_name() = kSignatureInput; + *output_tensor.mutable_name() = kSignatureOutput; + *signature_def.mutable_method_name() = kClassifyMethodName; + (*signature_def.mutable_inputs())[kSignatureInput] = input_tensor; + (*signature_def.mutable_outputs())[kSignatureOutput] = output_tensor; + return signature_def; + } + std::unique_ptr flatbuffer_model_; + const Model* model_; +}; + +TEST_F(SimpleSignatureDefUtilTest, SetSignatureDefTest) { + SignatureDef expected_signature_def = GetTestSignatureDef(); + std::string model_output; + const std::map expected_signature_def_map = { + {kDefaultServingSignatureDefKey, expected_signature_def}}; + EXPECT_EQ(Status::OK(), SetSignatureDefMap(model_, expected_signature_def_map, + &model_output)); + const Model* add_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_TRUE(HasSignatureDef(add_model, kDefaultServingSignatureDefKey)); + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(add_model, &test_signature_def_map)); + SignatureDef test_signature_def = + test_signature_def_map[kDefaultServingSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); +} + +TEST_F(SimpleSignatureDefUtilTest, OverwriteSignatureDefTest) { + auto expected_signature_def = GetTestSignatureDef(); + std::string model_output; + std::map expected_signature_def_map = { + {kDefaultServingSignatureDefKey, expected_signature_def}}; + EXPECT_EQ(Status::OK(), SetSignatureDefMap(model_, expected_signature_def_map, + &model_output)); + const Model* add_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_TRUE(HasSignatureDef(add_model, kDefaultServingSignatureDefKey)); + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(add_model, &test_signature_def_map)); + SignatureDef test_signature_def = + test_signature_def_map[kDefaultServingSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); + *expected_signature_def.mutable_method_name() = kPredictMethodName; + expected_signature_def_map.erase( + 
expected_signature_def_map.find(kDefaultServingSignatureDefKey)); + constexpr char kTestSignatureDefKey[] = "ServingTest"; + expected_signature_def_map[kTestSignatureDefKey] = expected_signature_def; + EXPECT_EQ( + Status::OK(), + SetSignatureDefMap(add_model, expected_signature_def_map, &model_output)); + const Model* final_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_FALSE(HasSignatureDef(final_model, kDefaultServingSignatureDefKey)); + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(final_model, &test_signature_def_map)); + EXPECT_NE(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); + EXPECT_TRUE(HasSignatureDef(final_model, kTestSignatureDefKey)); + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(final_model, &test_signature_def_map)); + test_signature_def = test_signature_def_map[kTestSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); +} + +TEST_F(SimpleSignatureDefUtilTest, GetSignatureDefTest) { + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), GetSignatureDefMap(model_, &test_signature_def_map)); + EXPECT_FALSE(HasSignatureDef(model_, kDefaultServingSignatureDefKey)); +} + +TEST_F(SimpleSignatureDefUtilTest, ClearSignatureDefTest) { + const int expected_num_buffers = model_->buffers()->size(); + auto expected_signature_def = GetTestSignatureDef(); + std::string model_output; + std::map expected_signature_def_map = { + {kDefaultServingSignatureDefKey, expected_signature_def}}; + EXPECT_EQ(Status::OK(), SetSignatureDefMap(model_, expected_signature_def_map, + &model_output)); + const Model* add_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_TRUE(HasSignatureDef(add_model, kDefaultServingSignatureDefKey)); + SignatureDef test_signature_def; + std::map test_signature_def_map; + EXPECT_EQ(Status::OK(), + GetSignatureDefMap(add_model, &test_signature_def_map)); + test_signature_def = test_signature_def_map[kDefaultServingSignatureDefKey]; + EXPECT_EQ(expected_signature_def.SerializeAsString(), + test_signature_def.SerializeAsString()); + EXPECT_EQ(Status::OK(), ClearSignatureDefMap(add_model, &model_output)); + const Model* clear_model = flatbuffers::GetRoot(model_output.data()); + EXPECT_FALSE(HasSignatureDef(clear_model, kDefaultServingSignatureDefKey)); + EXPECT_EQ(expected_num_buffers, clear_model->buffers()->size()); +} + +TEST_F(SimpleSignatureDefUtilTest, SetSignatureDefErrorsTest) { + std::map test_signature_def_map; + std::string model_output; + EXPECT_TRUE(tensorflow::errors::IsInvalidArgument( + SetSignatureDefMap(model_, test_signature_def_map, &model_output))); + SignatureDef test_signature_def; + test_signature_def_map[kDefaultServingSignatureDefKey] = test_signature_def; + EXPECT_TRUE(tensorflow::errors::IsInvalidArgument( + SetSignatureDefMap(model_, test_signature_def_map, nullptr))); +} + +} // namespace +} // namespace tflite + +int main(int argc, char** argv) { + ::tflite::LogToStderr(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc b/tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc new file mode 100644 index 00000000000..9477305d433 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_util_wrapper_pybind11.cc @@ -0,0 +1,95 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "pybind11/pybind11.h" +#include "pybind11/pytypes.h" +#include "pybind11/stl.h" +#include "tensorflow/lite/model_builder.h" +#include "tensorflow/lite/tools/signature/signature_def_util.h" +#include "tensorflow/python/lib/core/pybind11_lib.h" + +py::bytes WrappedSetSignatureDefMap( + const std::vector& model_buffer, + const std::map& serialized_signature_def_map) { + auto flatbuffer_model = tflite::FlatBufferModel::BuildFromBuffer( + reinterpret_cast(model_buffer.data()), model_buffer.size()); + auto* model = flatbuffer_model->GetModel(); + if (!model) { + throw std::invalid_argument("Invalid model"); + } + std::string data; + std::map signature_def_map; + for (const auto& entry : serialized_signature_def_map) { + tensorflow::SignatureDef signature_def; + if (!signature_def.ParseFromString(entry.second)) { + throw std::invalid_argument("Cannot parse signature def"); + } + signature_def_map[entry.first] = signature_def; + } + auto status = tflite::SetSignatureDefMap(model, signature_def_map, &data); + if (status != tensorflow::Status::OK()) { + throw std::invalid_argument(status.error_message()); + } + return py::bytes(data); +} + +std::map WrappedGetSignatureDefMap( + const std::vector& model_buffer) { + auto flatbuffer_model = tflite::FlatBufferModel::BuildFromBuffer( + reinterpret_cast(model_buffer.data()), model_buffer.size()); + auto* model = flatbuffer_model->GetModel(); + if (!model) { + throw std::invalid_argument("Invalid model"); + } + std::string content; + std::map signature_def_map; + auto status = tflite::GetSignatureDefMap(model, &signature_def_map); + if (status != tensorflow::Status::OK()) { + throw std::invalid_argument("Cannot parse signature def"); + } + std::map serialized_signature_def_map; + for (const auto& entry : signature_def_map) { + serialized_signature_def_map[entry.first] = + py::bytes(entry.second.SerializeAsString()); + } + return serialized_signature_def_map; +} + +py::bytes WrappedClearSignatureDefs(const std::vector& model_buffer) { + auto flatbuffer_model = tflite::FlatBufferModel::BuildFromBuffer( + reinterpret_cast(model_buffer.data()), model_buffer.size()); + auto* model = flatbuffer_model->GetModel(); + if (!model) { + throw std::invalid_argument("Invalid model"); + } + std::string content; + auto status = tflite::ClearSignatureDefMap(model, &content); + if (status != tensorflow::Status::OK()) { + throw std::invalid_argument("An unknown error occurred"); + } + return py::bytes(content); +} + +PYBIND11_MODULE(_pywrap_signature_def_util_wrapper, m) { + m.doc() = R"pbdoc( + _pywrap_signature_def_util_wrapper + ----- + )pbdoc"; + + m.def("SetSignatureDefMap", &WrappedSetSignatureDefMap); + + m.def("GetSignatureDefMap", &WrappedGetSignatureDefMap); + + m.def("ClearSignatureDefs", &WrappedClearSignatureDefs); +} diff --git a/tensorflow/lite/tools/signature/signature_def_utils.py b/tensorflow/lite/tools/signature/signature_def_utils.py new 
file mode 100644 index 00000000000..df25c651172 --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_utils.py @@ -0,0 +1,95 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Utility functions related to SignatureDefs.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.core.protobuf import meta_graph_pb2 +from tensorflow.lite.tools.signature import _pywrap_signature_def_util_wrapper as signature_def_util + + +def set_signature_defs(tflite_model, signature_def_map): + """Sets SignatureDefs to the Metadata of a TfLite flatbuffer buffer. + + Args: + tflite_model: Binary TFLite model (bytes or bytes-like object) to which to + add signature_def. + signature_def_map: dict containing SignatureDefs to store in metadata. + Returns: + buffer: A TFLite model binary identical to model buffer with + metadata field containing SignatureDef. + + Raises: + ValueError: + tflite_model buffer does not contain a valid TFLite model. + signature_def_map is empty or does not contain a SignatureDef. + """ + model = tflite_model + if not isinstance(tflite_model, bytearray): + model = bytearray(tflite_model) + serialized_signature_def_map = { + k: v.SerializeToString() for k, v in signature_def_map.items()} + model_buffer = signature_def_util.SetSignatureDefMap( + model, serialized_signature_def_map) + return model_buffer + + +def get_signature_defs(tflite_model): + """Get SignatureDef dict from the Metadata of a TfLite flatbuffer buffer. + + Args: + tflite_model: TFLite model buffer to get the signature_def. + + Returns: + dict containing serving names to SignatureDefs if exists, otherwise, empty + dict. + + Raises: + ValueError: + tflite_model buffer does not contain a valid TFLite model. + DecodeError: + SignatureDef cannot be parsed from TfLite SignatureDef metadata. + """ + model = tflite_model + if not isinstance(tflite_model, bytearray): + model = bytearray(tflite_model) + serialized_signature_def_map = signature_def_util.GetSignatureDefMap(model) + def _deserialize(serialized): + signature_def = meta_graph_pb2.SignatureDef() + signature_def.ParseFromString(serialized) + return signature_def + return {k: _deserialize(v) for k, v in serialized_signature_def_map.items()} + + +def clear_signature_defs(tflite_model): + """Clears SignatureDefs from the Metadata of a TfLite flatbuffer buffer. + + Args: + tflite_model: TFLite model buffer to remove signature_defs. + + Returns: + buffer: A TFLite model binary identical to model buffer with + no SignatureDef metadata. + + Raises: + ValueError: + tflite_model buffer does not contain a valid TFLite model. 
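+
+  Example (an illustrative sketch; `model_buffer` stands in for real TFLite
+  flatbuffer bytes and `sig_map` for a dict mapping keys to SignatureDef
+  protos):
+
+    stamped = set_signature_defs(model_buffer, sig_map)
+    assert set(get_signature_defs(stamped)) == set(sig_map)
+    stripped = clear_signature_defs(stamped)
+    assert not get_signature_defs(stripped)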
+ """ + model = tflite_model + if not isinstance(tflite_model, bytearray): + model = bytearray(tflite_model) + return signature_def_util.ClearSignatureDefs(model) diff --git a/tensorflow/lite/tools/signature/signature_def_utils_test.py b/tensorflow/lite/tools/signature/signature_def_utils_test.py new file mode 100644 index 00000000000..f7cb33188af --- /dev/null +++ b/tensorflow/lite/tools/signature/signature_def_utils_test.py @@ -0,0 +1,76 @@ +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the 'License'); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an 'AS IS' BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for signature_def_util.py. + + - Tests adding a SignatureDef to TFLite metadata. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tensorflow as tf +from tensorflow.core.protobuf import meta_graph_pb2 +from tensorflow.lite.tools.signature import signature_def_utils + + +class SignatureDefUtilsTest(tf.test.TestCase): + + def testAddSignatureDefToFlatbufferMetadata(self): + """Test a SavedModel conversion has correct Metadata.""" + filename = tf.compat.v1.resource_loader.get_path_to_datafile( + '../../testdata/add.bin') + if not os.path.exists(filename): + raise IOError('File "{0}" does not exist in {1}.'.format( + filename, + tf.compat.v1.resource_loader.get_root_dir_with_all_resources())) + + with tf.io.gfile.GFile(filename, 'rb') as fp: + tflite_model = bytearray(fp.read()) + + self.assertIsNotNone(tflite_model, 'TFLite model is none') + sig_input_tensor = meta_graph_pb2.TensorInfo( + dtype=tf.as_dtype(tf.float32).as_datatype_enum, + tensor_shape=tf.TensorShape([1, 8, 8, 3]).as_proto()) + sig_input_tensor_signature = {'x': sig_input_tensor} + sig_output_tensor = meta_graph_pb2.TensorInfo( + dtype=tf.as_dtype(tf.float32).as_datatype_enum, + tensor_shape=tf.TensorShape([1, 8, 8, 3]).as_proto()) + sig_output_tensor_signature = {'y': sig_output_tensor} + predict_signature_def = ( + tf.compat.v1.saved_model.build_signature_def( + sig_input_tensor_signature, sig_output_tensor_signature, + tf.saved_model.PREDICT_METHOD_NAME)) + serving_key = tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY + signature_def_map = {serving_key: predict_signature_def} + tflite_model = signature_def_utils.set_signature_defs( + tflite_model, signature_def_map) + saved_signature_def_map = signature_def_utils.get_signature_defs( + tflite_model) + signature_def = saved_signature_def_map.get(serving_key) + self.assertIsNotNone(signature_def, 'SignatureDef not found') + self.assertEqual(signature_def.SerializeToString(), + predict_signature_def.SerializeToString()) + remove_tflite_model = ( + signature_def_utils.clear_signature_defs(tflite_model)) + signature_def_map = signature_def_utils.get_signature_defs( + remove_tflite_model) + self.assertIsNone(signature_def_map.get(serving_key), + 'SignatureDef found, but should be missing') + + +if __name__ == '__main__': + tf.test.main() From 
3b225a9776de735ded763d227e0c4c869d8e85c6 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 14 May 2020 19:42:15 -0700 Subject: [PATCH 236/412] Rework kernel check for fully_connected. PiperOrigin-RevId: 311655034 Change-Id: Ic82fd9a9350cac89043db85d1ba1d4ec480435e5 --- tensorflow/lite/kernels/kernel_util.cc | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/kernels/kernel_util.cc b/tensorflow/lite/kernels/kernel_util.cc index b30747eac61..ded536ab3a7 100644 --- a/tensorflow/lite/kernels/kernel_util.cc +++ b/tensorflow/lite/kernels/kernel_util.cc @@ -126,11 +126,27 @@ TfLiteStatus GetQuantizedConvolutionMultipler(TfLiteContext* context, // pipeline. if (bias) { const double bias_scale = static_cast(bias->params.scale); - // Here we're making sure the input_product_scale & bias_scale the same. - // Normally this should be guaranteed by the training pipeline, we are - // setting the threshold to be 2e-6 to allow some numeric stability - // difference. - TF_LITE_ENSURE(context, std::abs(input_product_scale - bias_scale) <= 2e-6); + // Here we're making sure the input_product_scale & bias_scale are about the + // same. Since we have: + // (output - output_zp) * output_scale = + // input_product_scale * input_product + bias * bias_scale ---- (0) + // + // (0) equals: + // (input_product + bias) * input_product_scale ----- (1) + // + + // bias * (bias_scale - input_product_scale) ------ (2) + // + // For the real kernel computation, we're doing (1), so we really need to + // make sure (2) has minimum impact on the output, so: + // bias * (bias_scale - input_product_scale) / output_scale should be + // a small number for an integer. + // Since normally bias should be within a small range. + // We should expect (bias_scale - input_product_scale) / output_scale to + // be a small number like 0.02. + const double scale_diff = std::abs(input_product_scale - bias_scale); + const double output_scale = static_cast(output->params.scale); + + TF_LITE_ENSURE(context, scale_diff / output_scale <= 0.02); } return GetQuantizedConvolutionMultipler(context, input, filter, output, multiplier); From a98948acf8c92f580db09fe739f028796985b9e3 Mon Sep 17 00:00:00 2001 From: Jonathan Hseu Date: Thu, 14 May 2020 19:43:19 -0700 Subject: [PATCH 237/412] Use CompactTextString instead of String for generating ops. PiperOrigin-RevId: 311655146 Change-Id: I57e5c595522b47dd9badbf0720569ffef69fed66 --- tensorflow/go/genop/internal/genop.go | 9 +- tensorflow/go/genop/internal/genop_test.go | 255 +++++++++++++++++++++ 2 files changed, 259 insertions(+), 5 deletions(-) diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go index 95547045111..c4ea8abb543 100644 --- a/tensorflow/go/genop/internal/genop.go +++ b/tensorflow/go/genop/internal/genop.go @@ -567,11 +567,10 @@ func isListAttr(attrdef *odpb.OpDef_AttrDef) bool { // This is useful when 's' corresponds to a "oneof" protocol buffer message. // For example, consider the protocol buffer message: // oneof value { bool b = 1; int64 i = 2; } -// String() on a Go corresponding object (using proto.CompactTextString) will -// print "b:true", or "i:7" etc. This function strips out the leading "b:" or -// "i:". -func stripLeadingColon(s fmt.Stringer) string { - x := s.String() +// proto.CompactTextString) will print "b:true", or "i:7" etc. This function +// strips out the leading "b:" or "i:". 
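+// For example (illustrative only), if proto.CompactTextString(m) yields
+// "b:true" then stripLeadingColon(m) returns "true"; for "i:7" it returns
+// "7".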
+func stripLeadingColon(m proto.Message) string { + x := proto.CompactTextString(m) y := strings.SplitN(x, ":", 2) if len(y) < 2 { return x diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go index a339d181e8d..b467efc7aea 100644 --- a/tensorflow/go/genop/internal/genop_test.go +++ b/tensorflow/go/genop/internal/genop_test.go @@ -533,6 +533,261 @@ func TestOp(scope *Scope, bb tf.Output, aa tf.Output, optional ...TestOpAttr) (c op := scope.AddOperation(opspec) return op.Output(0) } +`, + }, + { + tag: "SampleDistortedBoundingBox", + opdef: ` +name: "SampleDistortedBoundingBox" +input_arg { + name: "image_size" + type_attr: "T" +} +input_arg { + name: "bounding_boxes" + type: DT_FLOAT +} +output_arg { + name: "begin" + type_attr: "T" +} +output_arg { + name: "size" + type_attr: "T" +} +output_arg { + name: "bboxes" + type: DT_FLOAT +} +attr { + name: "T" + type: "type" + allowed_values { + list { + type: DT_UINT8 + type: DT_INT8 + type: DT_INT16 + type: DT_INT32 + type: DT_INT64 + } + } +} +attr { + name: "seed" + type: "int" + default_value { + i: 0 + } +} +attr { + name: "seed2" + type: "int" + default_value { + i: 0 + } +} +attr { + name: "min_object_covered" + type: "float" + default_value { + f: 0.1 + } +} +attr { + name: "aspect_ratio_range" + type: "list(float)" + default_value { + list { + f: 0.75 + f: 1.33 + } + } +} +attr { + name: "area_range" + type: "list(float)" + default_value { + list { + f: 0.05 + f: 1 + } + } +} +attr { + name: "max_attempts" + type: "int" + default_value { + i: 100 + } +} +attr { + name: "use_image_if_no_bounding_boxes" + type: "bool" + default_value { + b: false + } +} +is_stateful: true +`, + apidef: ` +op { + graph_op_name: "SampleDistortedBoundingBox" + in_arg { + name: "image_size" + description: "Blah blah" + } + in_arg { + name: "bounding_boxes" + description: "Blah blah" + } + out_arg { + name: "begin" + description: "Blah blah" + } + out_arg { + name: "size" + description: "Blah blah" + } + out_arg { + name: "bboxes" + description: "Blah blah" + } + attr { + name: "seed" + description: "Blah blah" + } + attr { + name: "seed2" + description: "Blah blah" + } + attr { + name: "min_object_covered" + description: "Blah blah" + } + attr { + name: "aspect_ratio_range" + description: "Blah blah" + } + attr { + name: "area_range" + description: "Blah blah" + } + attr { + name: "max_attempts" + description: "Blah blah" + } + attr { + name: "use_image_if_no_bounding_boxes" + description: "Blah blah" + } + summary: "Generate a single randomly distorted bounding box for an image." + description: "Blah blah" +} +`, + wanted: ` +// SampleDistortedBoundingBoxAttr is an optional argument to SampleDistortedBoundingBox. +type SampleDistortedBoundingBoxAttr func(optionalAttr) + +// SampleDistortedBoundingBoxSeed sets the optional seed attribute to value. +// +// value: Blah blah +// If not specified, defaults to 0 +func SampleDistortedBoundingBoxSeed(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["seed"] = value + } +} + +// SampleDistortedBoundingBoxSeed2 sets the optional seed2 attribute to value. +// +// value: Blah blah +// If not specified, defaults to 0 +func SampleDistortedBoundingBoxSeed2(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["seed2"] = value + } +} + +// SampleDistortedBoundingBoxMinObjectCovered sets the optional min_object_covered attribute to value. 
+// +// value: Blah blah +// If not specified, defaults to 0.1 +func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["min_object_covered"] = value + } +} + +// SampleDistortedBoundingBoxAspectRatioRange sets the optional aspect_ratio_range attribute to value. +// +// value: Blah blah +// If not specified, defaults to +func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["aspect_ratio_range"] = value + } +} + +// SampleDistortedBoundingBoxAreaRange sets the optional area_range attribute to value. +// +// value: Blah blah +// If not specified, defaults to +func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["area_range"] = value + } +} + +// SampleDistortedBoundingBoxMaxAttempts sets the optional max_attempts attribute to value. +// +// value: Blah blah +// If not specified, defaults to 100 +func SampleDistortedBoundingBoxMaxAttempts(value int64) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["max_attempts"] = value + } +} + +// SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes sets the optional use_image_if_no_bounding_boxes attribute to value. +// +// value: Blah blah +// If not specified, defaults to false +func SampleDistortedBoundingBoxUseImageIfNoBoundingBoxes(value bool) SampleDistortedBoundingBoxAttr { + return func(m optionalAttr) { + m["use_image_if_no_bounding_boxes"] = value + } +} + +// Generate a single randomly distorted bounding box for an image. +// +// Blah blah +// +// Arguments: +// image_size: Blah blah +// bounding_boxes: Blah blah +// +// Returns: +// begin: Blah blah +// size: Blah blah +// bboxes: Blah blah +func SampleDistortedBoundingBox(scope *Scope, image_size tf.Output, bounding_boxes tf.Output, optional ...SampleDistortedBoundingBoxAttr) (begin tf.Output, size tf.Output, bboxes tf.Output) { + if scope.Err() != nil { + return + } + attrs := map[string]interface{}{} + for _, a := range optional { + a(attrs) + } + opspec := tf.OpSpec{ + Type: "SampleDistortedBoundingBox", + Input: []tf.Input{ + image_size, bounding_boxes, + }, + Attrs: attrs, + } + op := scope.AddOperation(opspec) + return op.Output(0), op.Output(1), op.Output(2) +} `, }, } From 9489fbca6759050ac9d4c9348a65d79b0c5c06ad Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 14 May 2020 19:47:32 -0700 Subject: [PATCH 238/412] Go: Update generated wrapper functions for TensorFlow ops. PiperOrigin-RevId: 311655487 Change-Id: Ia4b492dc27139b316a3f8a5b90d68582c05efe4a --- tensorflow/go/op/wrappers.go | 206 +++++++++++++++++------------------ 1 file changed, 103 insertions(+), 103 deletions(-) diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go index e6725269279..04c36ed3399 100644 --- a/tensorflow/go/op/wrappers.go +++ b/tensorflow/go/op/wrappers.go @@ -1274,7 +1274,7 @@ type SqueezeAttr func(optionalAttr) // value: If specified, only squeezes the dimensions listed. The dimension // index starts at 0. It is an error to squeeze a dimension that is not 1. Must // be in the range `[-rank(input), rank(input))`. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func SqueezeAxis(value []int64) SqueezeAttr { @@ -1358,7 +1358,7 @@ type PlaceholderAttr func(optionalAttr) // // value: (Optional) The shape of the tensor. 
If the shape has 0 dimensions, the // shape is unconstrained. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func PlaceholderShape(value tf.Shape) PlaceholderAttr { return func(m optionalAttr) { m["shape"] = value @@ -4016,7 +4016,7 @@ func FixedUnigramCandidateSamplerShard(value int64) FixedUnigramCandidateSampler // // value: A list of unigram counts or probabilities, one per ID in sequential // order. Exactly one of vocab_file and unigrams should be passed to this op. -// If not specified, defaults to {} +// If not specified, defaults to <> func FixedUnigramCandidateSamplerUnigrams(value []float32) FixedUnigramCandidateSamplerAttr { return func(m optionalAttr) { m["unigrams"] = value @@ -7166,7 +7166,7 @@ func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source type TensorArrayV2Attr func(optionalAttr) // TensorArrayV2ElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -7291,7 +7291,7 @@ type TensorArrayConcatV3Attr func(optionalAttr) // excluding the first dimension. Used to validate the shapes of // TensorArray elements. If this shape is not fully specified, concatenating // zero-size TensorArrays is an error. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayConcatV3ElementShapeExcept0(value tf.Shape) TensorArrayConcatV3Attr { return func(m optionalAttr) { m["element_shape_except0"] = value @@ -7350,7 +7350,7 @@ type TensorArrayGatherV3Attr func(optionalAttr) // value: The expected shape of an element, if known. Used to // validate the shapes of TensorArray elements. If this shape is not // fully specified, gathering zero-size TensorArrays is an error. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayGatherV3ElementShape(value tf.Shape) TensorArrayGatherV3Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -7841,7 +7841,7 @@ type PriorityQueueV2Attr func(optionalAttr) // PriorityQueueV2ComponentTypes sets the optional component_types attribute to value. // // value: The type of each component in a value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func PriorityQueueV2ComponentTypes(value []tf.DataType) PriorityQueueV2Attr { @@ -8148,7 +8148,7 @@ type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr) // MultiDeviceIteratorFromStringHandleOutputTypes sets the optional output_types attribute to value. // // value: The type list for the return values. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDeviceIteratorFromStringHandleAttr { @@ -8160,7 +8160,7 @@ func MultiDeviceIteratorFromStringHandleOutputTypes(value []tf.DataType) MultiDe // MultiDeviceIteratorFromStringHandleOutputShapes sets the optional output_shapes attribute to value. // // value: The list of shapes being produced. 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func MultiDeviceIteratorFromStringHandleOutputShapes(value []tf.Shape) MultiDeviceIteratorFromStringHandleAttr { @@ -8516,7 +8516,7 @@ func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output type OptimizeDatasetAttr func(optionalAttr) // OptimizeDatasetOptimizationConfigs sets the optional optimization_configs attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func OptimizeDatasetOptimizationConfigs(value []string) OptimizeDatasetAttr { return func(m optionalAttr) { m["optimization_configs"] = value @@ -9292,7 +9292,7 @@ type RandomShuffleQueueV2Attr func(optionalAttr) // be either 0 or the same as the length of component_types. If the length of // this attr is 0, the shapes of queue elements are not constrained, and // only one element may be dequeued at a time. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func RandomShuffleQueueV2Shapes(value []tf.Shape) RandomShuffleQueueV2Attr { @@ -9515,7 +9515,7 @@ func DebugIdentityV2TensorDebugMode(value int64) DebugIdentityV2Attr { // DebugIdentityV2DebugUrls sets the optional debug_urls attribute to value. // // value: List of URLs to debug targets, e.g., file:///foo/tfdbg_dump. -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugIdentityV2DebugUrls(value []string) DebugIdentityV2Attr { return func(m optionalAttr) { m["debug_urls"] = value @@ -9580,7 +9580,7 @@ func DebugNanCountTensorName(value string) DebugNanCountAttr { // // value: List of URLs to debug targets, e.g., // file:///foo/tfdbg_dump, grpc:://localhost:11011. -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugNanCountDebugUrls(value []string) DebugNanCountAttr { return func(m optionalAttr) { m["debug_urls"] = value @@ -9654,7 +9654,7 @@ func DebugIdentityTensorName(value string) DebugIdentityAttr { // // value: List of URLs to debug targets, e.g., // file:///foo/tfdbg_dump, grpc:://localhost:11011 -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugIdentityDebugUrls(value []string) DebugIdentityAttr { return func(m optionalAttr) { m["debug_urls"] = value @@ -10521,7 +10521,7 @@ func ParseExampleDatasetV2Deterministic(value string) ParseExampleDatasetV2Attr } // ParseExampleDatasetV2RaggedKeys sets the optional ragged_keys attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetV2RaggedKeys(value []string) ParseExampleDatasetV2Attr { @@ -10531,7 +10531,7 @@ func ParseExampleDatasetV2RaggedKeys(value []string) ParseExampleDatasetV2Attr { } // ParseExampleDatasetV2RaggedValueTypes sets the optional ragged_value_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetV2RaggedValueTypes(value []tf.DataType) ParseExampleDatasetV2Attr { @@ -10541,7 +10541,7 @@ func ParseExampleDatasetV2RaggedValueTypes(value []tf.DataType) ParseExampleData } // ParseExampleDatasetV2RaggedSplitTypes sets the optional ragged_split_types attribute to value. 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetV2RaggedSplitTypes(value []tf.DataType) ParseExampleDatasetV2Attr { @@ -12053,7 +12053,7 @@ func SampleDistortedBoundingBoxMinObjectCovered(value float32) SampleDistortedBo // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -12064,7 +12064,7 @@ func SampleDistortedBoundingBoxAspectRatioRange(value []float32) SampleDistorted // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. -// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to func SampleDistortedBoundingBoxAreaRange(value []float32) SampleDistortedBoundingBoxAttr { return func(m optionalAttr) { m["area_range"] = value @@ -13310,7 +13310,7 @@ func ParseExampleDatasetSloppy(value bool) ParseExampleDatasetAttr { } // ParseExampleDatasetRaggedKeys sets the optional ragged_keys attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetRaggedKeys(value []string) ParseExampleDatasetAttr { @@ -13320,7 +13320,7 @@ func ParseExampleDatasetRaggedKeys(value []string) ParseExampleDatasetAttr { } // ParseExampleDatasetRaggedValueTypes sets the optional ragged_value_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetRaggedValueTypes(value []tf.DataType) ParseExampleDatasetAttr { @@ -13330,7 +13330,7 @@ func ParseExampleDatasetRaggedValueTypes(value []tf.DataType) ParseExampleDatase } // ParseExampleDatasetRaggedSplitTypes sets the optional ragged_split_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseExampleDatasetRaggedSplitTypes(value []tf.DataType) ParseExampleDatasetAttr { @@ -13895,7 +13895,7 @@ func DebugNumericSummaryTensorName(value string) DebugNumericSummaryAttr { // // value: List of URLs to debug targets, e.g., // file:///foo/tfdbg_dump, grpc:://localhost:11011. -// If not specified, defaults to {} +// If not specified, defaults to <> func DebugNumericSummaryDebugUrls(value []string) DebugNumericSummaryAttr { return func(m optionalAttr) { m["debug_urls"] = value @@ -15139,7 +15139,7 @@ func TensorSummaryDescription(value string) TensorSummaryAttr { // TensorSummaryLabels sets the optional labels attribute to value. // // value: An unused list of strings. -// If not specified, defaults to {} +// If not specified, defaults to <> func TensorSummaryLabels(value []string) TensorSummaryAttr { return func(m optionalAttr) { m["labels"] = value @@ -15396,7 +15396,7 @@ func MutableHashTableOfTensorsV2UseNodeNameSharing(value bool) MutableHashTableO } // MutableHashTableOfTensorsV2ValueShape sets the optional value_shape attribute to value. 
-// If not specified, defaults to {} +// If not specified, defaults to <> func MutableHashTableOfTensorsV2ValueShape(value tf.Shape) MutableHashTableOfTensorsV2Attr { return func(m optionalAttr) { m["value_shape"] = value @@ -16112,7 +16112,7 @@ type ParseSingleSequenceExampleAttr func(optionalAttr) // each context Feature given in context_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr { @@ -16122,7 +16122,7 @@ func ParseSingleSequenceExampleContextSparseTypes(value []tf.DataType) ParseSing } // ParseSingleSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr { @@ -16138,7 +16138,7 @@ func ParseSingleSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseS // The number of elements in the Feature corresponding to context_dense_key[j] // must always equal context_dense_shapes[j].NumEntries(). // The shape of context_dense_values[j] will match context_dense_shapes[j]. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr { @@ -16153,7 +16153,7 @@ func ParseSingleSequenceExampleContextDenseShapes(value []tf.Shape) ParseSingleS // of data in each FeatureList given in feature_list_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSingleSequenceExampleAttr { @@ -16169,7 +16169,7 @@ func ParseSingleSequenceExampleFeatureListSparseTypes(value []tf.DataType) Parse // The shape of each Feature in the FeatureList corresponding to // feature_list_dense_key[j] must always equal // feature_list_dense_shapes[j].NumEntries(). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSingleSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSingleSequenceExampleAttr { @@ -18969,7 +18969,7 @@ func SampleDistortedBoundingBoxV2Seed2(value int64) SampleDistortedBoundingBoxV2 // // value: The cropped area of the image must have an aspect ratio = // width / height within this range. -// If not specified, defaults to {f:0.75 f:1.33} +// If not specified, defaults to func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["aspect_ratio_range"] = value @@ -18980,7 +18980,7 @@ func SampleDistortedBoundingBoxV2AspectRatioRange(value []float32) SampleDistort // // value: The cropped area of the image must contain a fraction of the // supplied image within this range. 
-// If not specified, defaults to {f:0.05 f:1} +// If not specified, defaults to func SampleDistortedBoundingBoxV2AreaRange(value []float32) SampleDistortedBoundingBoxV2Attr { return func(m optionalAttr) { m["area_range"] = value @@ -19384,7 +19384,7 @@ func ImageSummaryMaxImages(value int64) ImageSummaryAttr { // ImageSummaryBadColor sets the optional bad_color attribute to value. // // value: Color to use for pixels with non-finite values. -// If not specified, defaults to {dtype:DT_UINT8 tensor_shape:{dim:{size:4}} int_val:255 int_val:0 int_val:0 int_val:255} +// If not specified, defaults to > int_val:255 int_val:0 int_val:0 int_val:255 > func ImageSummaryBadColor(value tf.Tensor) ImageSummaryAttr { return func(m optionalAttr) { m["bad_color"] = value @@ -20455,7 +20455,7 @@ func Conv3DBackpropFilterV2DataFormat(value string) Conv3DBackpropFilterV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropFilterV2Dilations(value []int64) Conv3DBackpropFilterV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -21599,7 +21599,7 @@ func Conv2DBackpropInputUseCudnnOnGpu(value bool) Conv2DBackpropInputAttr { // dimension, the amount of padding inserted before and after the dimension is // `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If // `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to {} +// If not specified, defaults to <> func Conv2DBackpropInputExplicitPaddings(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -21627,7 +21627,7 @@ func Conv2DBackpropInputDataFormat(value string) Conv2DBackpropInputAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv2DBackpropInputDilations(value []int64) Conv2DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22307,7 +22307,7 @@ func Conv2DUseCudnnOnGpu(value bool) Conv2DAttr { // dimension, the amount of padding inserted before and after the dimension is // `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If // `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to {} +// If not specified, defaults to <> func Conv2DExplicitPaddings(value []int64) Conv2DAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -22335,7 +22335,7 @@ func Conv2DDataFormat(value string) Conv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv2DDilations(value []int64) Conv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22531,7 +22531,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeOutType(value tf.DataTy // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations sets the optional dilations attribute to value. // // value: List of dilation values. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22539,7 +22539,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeDilations(value []int64 } // QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizePaddingList sets the optional padding_list attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizePaddingList(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAndRequantizeAttr { return func(m optionalAttr) { m["padding_list"] = value @@ -22600,7 +22600,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluOutType(value tf.DataType) Quantized // QuantizedDepthwiseConv2DWithBiasAndReluDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22608,7 +22608,7 @@ func QuantizedDepthwiseConv2DWithBiasAndReluDilations(value []int64) QuantizedDe } // QuantizedDepthwiseConv2DWithBiasAndReluPaddingList sets the optional padding_list attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func QuantizedDepthwiseConv2DWithBiasAndReluPaddingList(value []int64) QuantizedDepthwiseConv2DWithBiasAndReluAttr { return func(m optionalAttr) { m["padding_list"] = value @@ -22715,7 +22715,7 @@ func QuantizedDepthwiseConv2DWithBiasOutType(value tf.DataType) QuantizedDepthwi // QuantizedDepthwiseConv2DWithBiasDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DWithBiasDilations(value []int64) QuantizedDepthwiseConv2DWithBiasAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22774,7 +22774,7 @@ func QuantizedDepthwiseConv2DOutType(value tf.DataType) QuantizedDepthwiseConv2D // QuantizedDepthwiseConv2DDilations sets the optional dilations attribute to value. // // value: List of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedDepthwiseConv2DDilations(value []int64) QuantizedDepthwiseConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -22948,7 +22948,7 @@ func QuantizedConv2DPerChannelOutType(value tf.DataType) QuantizedConv2DPerChann // QuantizedConv2DPerChannelDilations sets the optional dilations attribute to value. // // value: list of dilation values. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedConv2DPerChannelDilations(value []int64) QuantizedConv2DPerChannelAttr { return func(m optionalAttr) { m["dilations"] = value @@ -23325,7 +23325,7 @@ func Conv3DBackpropInputV2DataFormat(value string) Conv3DBackpropInputV2Attr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. 
-// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropInputV2Dilations(value []int64) Conv3DBackpropInputV2Attr { return func(m optionalAttr) { m["dilations"] = value @@ -23700,7 +23700,7 @@ func QuantizedMatMulWithBias(scope *Scope, a tf.Output, b tf.Output, bias tf.Out type TensorArrayGatherV2Attr func(optionalAttr) // TensorArrayGatherV2ElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayGatherV2ElementShape(value tf.Shape) TensorArrayGatherV2Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -23895,7 +23895,7 @@ func CopyTensorName(value string) CopyAttr { // ;;, wherein gated_grpc is boolean represented // as 0/1. E.g., "DebugIdentity;grpc://foo:3333;1", // "DebugIdentity;file:///tmp/tfdbg_1;0". -// If not specified, defaults to {} +// If not specified, defaults to <> func CopyDebugOpsSpec(value []string) CopyAttr { return func(m optionalAttr) { m["debug_ops_spec"] = value @@ -24127,7 +24127,7 @@ type FIFOQueueV2Attr func(optionalAttr) // be either 0 or the same as the length of component_types. If the length of // this attr is 0, the shapes of queue elements are not constrained, and // only one element may be dequeued at a time. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func FIFOQueueV2Shapes(value []tf.Shape) FIFOQueueV2Attr { @@ -24471,7 +24471,7 @@ func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTable // MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value. // // value: The shape of each value. -// If not specified, defaults to {} +// If not specified, defaults to <> func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr { return func(m optionalAttr) { m["value_shape"] = value @@ -25648,7 +25648,7 @@ func AvgPool3DGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksi type Conv3DBackpropFilterAttr func(optionalAttr) // Conv3DBackpropFilterDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropFilterDilations(value []int64) Conv3DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25711,7 +25711,7 @@ func Conv3DDataFormat(value string) Conv3DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DDilations(value []int64) Conv3DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -25934,7 +25934,7 @@ func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o * type DepthwiseConv2dNativeBackpropInputAttr func(optionalAttr) // DepthwiseConv2dNativeBackpropInputExplicitPaddings sets the optional explicit_paddings attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DepthwiseConv2dNativeBackpropInputExplicitPaddings(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -25962,7 +25962,7 @@ func DepthwiseConv2dNativeBackpropInputDataFormat(value string) DepthwiseConv2dN // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func DepthwiseConv2dNativeBackpropInputDilations(value []int64) DepthwiseConv2dNativeBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -26446,7 +26446,7 @@ func QuantizedConv2DOutType(value tf.DataType) QuantizedConv2DAttr { // filter element on that dimension. The dimension order is determined by the // value of `data_format`, see above for details. Dilations in the batch and // depth dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func QuantizedConv2DDilations(value []int64) QuantizedConv2DAttr { return func(m optionalAttr) { m["dilations"] = value @@ -27242,7 +27242,7 @@ func ParseSequenceExampleV2NcontextSparse(value int64) ParseSequenceExampleV2Att // each context Feature given in context_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextSparseTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27254,7 +27254,7 @@ func ParseSequenceExampleV2ContextSparseTypes(value []tf.DataType) ParseSequence // ParseSequenceExampleV2ContextRaggedValueTypes sets the optional context_ragged_value_types attribute to value. // // value: RaggedTensor.value dtypes for the ragged context features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextRaggedValueTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27266,7 +27266,7 @@ func ParseSequenceExampleV2ContextRaggedValueTypes(value []tf.DataType) ParseSeq // ParseSequenceExampleV2ContextRaggedSplitTypes sets the optional context_ragged_split_types attribute to value. // // value: RaggedTensor.row_split dtypes for the ragged context features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextRaggedSplitTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27282,7 +27282,7 @@ func ParseSequenceExampleV2ContextRaggedSplitTypes(value []tf.DataType) ParseSeq // The number of elements in the Feature corresponding to context_dense_key[j] // must always equal context_dense_shapes[j].NumEntries(). // The shape of context_dense_values[j] will match context_dense_shapes[j]. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2ContextDenseShapes(value []tf.Shape) ParseSequenceExampleV2Attr { @@ -27312,7 +27312,7 @@ func ParseSequenceExampleV2NfeatureListDense(value int64) ParseSequenceExampleV2 } // ParseSequenceExampleV2FeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27327,7 +27327,7 @@ func ParseSequenceExampleV2FeatureListDenseTypes(value []tf.DataType) ParseSeque // of data in each FeatureList given in feature_list_sparse_keys. 
// Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27339,7 +27339,7 @@ func ParseSequenceExampleV2FeatureListSparseTypes(value []tf.DataType) ParseSequ // ParseSequenceExampleV2FeatureListRaggedValueTypes sets the optional feature_list_ragged_value_types attribute to value. // // value: RaggedTensor.value dtypes for the ragged FeatureList features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListRaggedValueTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27351,7 +27351,7 @@ func ParseSequenceExampleV2FeatureListRaggedValueTypes(value []tf.DataType) Pars // ParseSequenceExampleV2FeatureListRaggedSplitTypes sets the optional feature_list_ragged_split_types attribute to value. // // value: RaggedTensor.row_split dtypes for the ragged FeatureList features. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListRaggedSplitTypes(value []tf.DataType) ParseSequenceExampleV2Attr { @@ -27367,7 +27367,7 @@ func ParseSequenceExampleV2FeatureListRaggedSplitTypes(value []tf.DataType) Pars // The shape of each Feature in the FeatureList corresponding to // feature_list_dense_key[j] must always equal // feature_list_dense_shapes[j].NumEntries(). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleV2FeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleV2Attr { @@ -28548,7 +28548,7 @@ func BatchMaxEnqueuedBatches(value int64) BatchAttr { } // BatchAllowedBatchSizes sets the optional allowed_batch_sizes attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func BatchAllowedBatchSizes(value []int64) BatchAttr { return func(m optionalAttr) { m["allowed_batch_sizes"] = value @@ -31175,7 +31175,7 @@ func VarHandleOpSharedName(value string) VarHandleOpAttr { // // value: The allowed devices containing the resource variable. Set when the output // ResourceHandle represents a per-replica/partitioned resource variable. -// If not specified, defaults to {} +// If not specified, defaults to <> func VarHandleOpAllowedDevices(value []string) VarHandleOpAttr { return func(m optionalAttr) { m["allowed_devices"] = value @@ -32522,7 +32522,7 @@ func CopyHostTensorName(value string) CopyHostAttr { // ;;, wherein gated_grpc is boolean represented // as 0/1. E.g., "DebugIdentity;grpc://foo:3333;1", // "DebugIdentity;file:///tmp/tfdbg_1;0". -// If not specified, defaults to {} +// If not specified, defaults to <> func CopyHostDebugOpsSpec(value []string) CopyHostAttr { return func(m optionalAttr) { m["debug_ops_spec"] = value @@ -32851,7 +32851,7 @@ type IteratorFromStringHandleAttr func(optionalAttr) // // value: If specified, defines the type of each tuple component in an // element produced by the resulting iterator. 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromStringHandleAttr { @@ -32864,7 +32864,7 @@ func IteratorFromStringHandleOutputTypes(value []tf.DataType) IteratorFromString // // value: If specified, defines the shape of each tuple component in an // element produced by the resulting iterator. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func IteratorFromStringHandleOutputShapes(value []tf.Shape) IteratorFromStringHandleAttr { @@ -34520,7 +34520,7 @@ type TensorArrayV3Attr func(optionalAttr) // value: The expected shape of an element, if known. Used to // validate the shapes of TensorArray elements. If this shape is not // fully specified, gathering zero-size TensorArrays is an error. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayV3ElementShape(value tf.Shape) TensorArrayV3Attr { return func(m optionalAttr) { m["element_shape"] = value @@ -36610,7 +36610,7 @@ func ParseSequenceExampleNfeatureListDense(value int64) ParseSequenceExampleAttr // each context Feature given in context_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { @@ -36620,7 +36620,7 @@ func ParseSequenceExampleContextSparseTypes(value []tf.DataType) ParseSequenceEx } // ParseSequenceExampleFeatureListDenseTypes sets the optional feature_list_dense_types attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenceExampleAttr { @@ -36636,7 +36636,7 @@ func ParseSequenceExampleFeatureListDenseTypes(value []tf.DataType) ParseSequenc // The number of elements in the Feature corresponding to context_dense_key[j] // must always equal context_dense_shapes[j].NumEntries(). // The shape of context_dense_values[j] will match context_dense_shapes[j]. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { @@ -36651,7 +36651,7 @@ func ParseSequenceExampleContextDenseShapes(value []tf.Shape) ParseSequenceExamp // of data in each FeatureList given in feature_list_sparse_keys. // Currently the ParseSingleSequenceExample supports DT_FLOAT (FloatList), // DT_INT64 (Int64List), and DT_STRING (BytesList). -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequenceExampleAttr { @@ -36667,7 +36667,7 @@ func ParseSequenceExampleFeatureListSparseTypes(value []tf.DataType) ParseSequen // The shape of each Feature in the FeatureList corresponding to // feature_list_dense_key[j] must always equal // feature_list_dense_shapes[j].NumEntries(). 
-// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func ParseSequenceExampleFeatureListDenseShapes(value []tf.Shape) ParseSequenceExampleAttr { @@ -39265,7 +39265,7 @@ type PrelinearizeTupleAttr func(optionalAttr) // tuple shapes in the order the shapes appear in the "shapes" input. The layout // elements for a sub-shape can be set to -1 in which case the corresponding layout // will be computed by the infeed operation. -// If not specified, defaults to {} +// If not specified, defaults to <> func PrelinearizeTupleLayouts(value []int64) PrelinearizeTupleAttr { return func(m optionalAttr) { m["layouts"] = value @@ -40936,7 +40936,7 @@ func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_ke type DatasetToGraphAttr func(optionalAttr) // DatasetToGraphStatefulWhitelist sets the optional stateful_whitelist attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func DatasetToGraphStatefulWhitelist(value []string) DatasetToGraphAttr { @@ -41345,7 +41345,7 @@ func ResourceApplyKerasMomentum(scope *Scope, var_ tf.Output, accum tf.Output, l type TensorArrayConcatV2Attr func(optionalAttr) // TensorArrayConcatV2ElementShapeExcept0 sets the optional element_shape_except0 attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorArrayConcatV2ElementShapeExcept0(value tf.Shape) TensorArrayConcatV2Attr { return func(m optionalAttr) { m["element_shape_except0"] = value @@ -41660,7 +41660,7 @@ func TPUReplicateMetadataUseTpu(value bool) TPUReplicateMetadataAttr { // TPUReplicateMetadataDeviceAssignment sets the optional device_assignment attribute to value. // // value: The assignment of devices for the computation. -// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["device_assignment"] = value @@ -41670,7 +41670,7 @@ func TPUReplicateMetadataDeviceAssignment(value []int64) TPUReplicateMetadataAtt // TPUReplicateMetadataComputationShape sets the optional computation_shape attribute to value. // // value: DEPRECATED. Use num_cores_per_replica instead. -// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["computation_shape"] = value @@ -41678,7 +41678,7 @@ func TPUReplicateMetadataComputationShape(value []int64) TPUReplicateMetadataAtt } // TPUReplicateMetadataHostComputeCore sets the optional host_compute_core attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["host_compute_core"] = value @@ -41686,7 +41686,7 @@ func TPUReplicateMetadataHostComputeCore(value []string) TPUReplicateMetadataAtt } // TPUReplicateMetadataPaddingMap sets the optional padding_map attribute to value. 
-// If not specified, defaults to {} +// If not specified, defaults to <> func TPUReplicateMetadataPaddingMap(value []string) TPUReplicateMetadataAttr { return func(m optionalAttr) { m["padding_map"] = value @@ -41737,7 +41737,7 @@ func TPUReplicateMetadata(scope *Scope, num_replicas int64, optional ...TPURepli type TensorListConcatAttr func(optionalAttr) // TensorListConcatElementShape sets the optional element_shape attribute to value. -// If not specified, defaults to {unknown_rank:true} +// If not specified, defaults to func TensorListConcatElementShape(value tf.Shape) TensorListConcatAttr { return func(m optionalAttr) { m["element_shape"] = value @@ -42546,7 +42546,7 @@ func Cosh(scope *Scope, x tf.Output) (y tf.Output) { type CollectiveReduceAttr func(optionalAttr) // CollectiveReduceWaitFor sets the optional wait_for attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func CollectiveReduceWaitFor(value []int64) CollectiveReduceAttr { return func(m optionalAttr) { m["wait_for"] = value @@ -43077,7 +43077,7 @@ func DecodeCSVNaValue(value string) DecodeCSVAttr { } // DecodeCSVSelectCols sets the optional select_cols attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DecodeCSVSelectCols(value []int64) DecodeCSVAttr { return func(m optionalAttr) { m["select_cols"] = value @@ -43622,7 +43622,7 @@ func EnqueueTPUEmbeddingSparseBatchDeviceOrdinal(value int64) EnqueueTPUEmbeddin // the sum of the weights be 0 for 'mean' or the sum of the squared weights be // 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for // all tables. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingSparseBatchCombiners(value []string) EnqueueTPUEmbeddingSparseBatchAttr { return func(m optionalAttr) { m["combiners"] = value @@ -44714,7 +44714,7 @@ func EnqueueTPUEmbeddingRaggedTensorBatchDeviceOrdinal(value int64) EnqueueTPUEm // the sum of the weights be 0 for 'mean' or the sum of the squared weights be // 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for // all tables. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingRaggedTensorBatchCombiners(value []string) EnqueueTPUEmbeddingRaggedTensorBatchAttr { return func(m optionalAttr) { m["combiners"] = value @@ -44722,7 +44722,7 @@ func EnqueueTPUEmbeddingRaggedTensorBatchCombiners(value []string) EnqueueTPUEmb } // EnqueueTPUEmbeddingRaggedTensorBatchMaxSequenceLengths sets the optional max_sequence_lengths attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingRaggedTensorBatchMaxSequenceLengths(value []int64) EnqueueTPUEmbeddingRaggedTensorBatchAttr { return func(m optionalAttr) { m["max_sequence_lengths"] = value @@ -45506,7 +45506,7 @@ func CropAndResize(scope *Scope, image tf.Output, boxes tf.Output, box_ind tf.Ou type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr) // DepthwiseConv2dNativeBackpropFilterExplicitPaddings sets the optional explicit_paddings attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DepthwiseConv2dNativeBackpropFilterExplicitPaddings(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -45534,7 +45534,7 @@ func DepthwiseConv2dNativeBackpropFilterDataFormat(value string) DepthwiseConv2d // element on that dimension. 
The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func DepthwiseConv2dNativeBackpropFilterDilations(value []int64) DepthwiseConv2dNativeBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -46632,7 +46632,7 @@ type InfeedEnqueueTupleAttr func(optionalAttr) // all the tuple shapes, in the order the shapes appear in the "shapes" input. // The layout elements for a sub-shape can be set to -1, in which case the // corresponding layout will be computed by the infeed operation. -// If not specified, defaults to {} +// If not specified, defaults to <> func InfeedEnqueueTupleLayouts(value []int64) InfeedEnqueueTupleAttr { return func(m optionalAttr) { m["layouts"] = value @@ -47474,7 +47474,7 @@ func LoadTPUEmbeddingFTRLParameters(scope *Scope, parameters tf.Output, accumula type Conv3DBackpropInputAttr func(optionalAttr) // Conv3DBackpropInputDilations sets the optional dilations attribute to value. -// If not specified, defaults to {i:1 i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv3DBackpropInputDilations(value []int64) Conv3DBackpropInputAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47517,7 +47517,7 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba type DepthwiseConv2dNativeAttr func(optionalAttr) // DepthwiseConv2dNativeExplicitPaddings sets the optional explicit_paddings attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func DepthwiseConv2dNativeExplicitPaddings(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -47545,7 +47545,7 @@ func DepthwiseConv2dNativeDataFormat(value string) DepthwiseConv2dNativeAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func DepthwiseConv2dNativeDilations(value []int64) DepthwiseConv2dNativeAttr { return func(m optionalAttr) { m["dilations"] = value @@ -47634,7 +47634,7 @@ type PaddingFIFOQueueV2Attr func(optionalAttr) // zeros up to the maximum shape of all elements in the given batch. // If the length of this attr is 0, different queue elements may have // different ranks and shapes, but only one element may be dequeued at a time. -// If not specified, defaults to {} +// If not specified, defaults to <> // // REQUIRES: len(value) >= 0 func PaddingFIFOQueueV2Shapes(value []tf.Shape) PaddingFIFOQueueV2Attr { @@ -47886,7 +47886,7 @@ type InfeedEnqueueAttr func(optionalAttr) // InfeedEnqueueShape sets the optional shape attribute to value. // // value: The shape of the tensor. -// If not specified, defaults to {} +// If not specified, defaults to <> func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr { return func(m optionalAttr) { m["shape"] = value @@ -47898,7 +47898,7 @@ func InfeedEnqueueShape(value tf.Shape) InfeedEnqueueAttr { // value: A vector holding the requested layout in minor-to-major sequence. // If a layout attribute is passed, but its values are all -1, the layout will // be computed by the infeed operation. 
-// If not specified, defaults to {} +// If not specified, defaults to <> func InfeedEnqueueLayout(value []int64) InfeedEnqueueAttr { return func(m optionalAttr) { m["layout"] = value @@ -48506,7 +48506,7 @@ func Conv2DBackpropFilterUseCudnnOnGpu(value bool) Conv2DBackpropFilterAttr { // dimension, the amount of padding inserted before and after the dimension is // `explicit_paddings[2 * i]` and `explicit_paddings[2 * i + 1]`, respectively. If // `padding` is not `"EXPLICIT"`, `explicit_paddings` must be empty. -// If not specified, defaults to {} +// If not specified, defaults to <> func Conv2DBackpropFilterExplicitPaddings(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["explicit_paddings"] = value @@ -48534,7 +48534,7 @@ func Conv2DBackpropFilterDataFormat(value string) Conv2DBackpropFilterAttr { // element on that dimension. The dimension order is determined by the value of // `data_format`, see above for details. Dilations in the batch and depth // dimensions must be 1. -// If not specified, defaults to {i:1 i:1 i:1 i:1} +// If not specified, defaults to func Conv2DBackpropFilterDilations(value []int64) Conv2DBackpropFilterAttr { return func(m optionalAttr) { m["dilations"] = value @@ -48653,7 +48653,7 @@ type PrelinearizeAttr func(optionalAttr) // PrelinearizeShape sets the optional shape attribute to value. // // value: The shape of the tensor. -// If not specified, defaults to {} +// If not specified, defaults to <> func PrelinearizeShape(value tf.Shape) PrelinearizeAttr { return func(m optionalAttr) { m["shape"] = value @@ -48665,7 +48665,7 @@ func PrelinearizeShape(value tf.Shape) PrelinearizeAttr { // value: A vector holding the requested layout in minor-to-major sequence. If a layout // attribute is passed but its values are all -1 the layout will be computed by // the infeed operation. -// If not specified, defaults to {} +// If not specified, defaults to <> func PrelinearizeLayout(value []int64) PrelinearizeAttr { return func(m optionalAttr) { m["layout"] = value @@ -49084,7 +49084,7 @@ func EnqueueTPUEmbeddingSparseTensorBatchDeviceOrdinal(value int64) EnqueueTPUEm // the sum of the weights be 0 for 'mean' or the sum of the squared weights be // 0 for 'sqrtn'. If combiners isn't passed, the default is to use 'sum' for // all tables. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmbeddingSparseTensorBatchAttr { return func(m optionalAttr) { m["combiners"] = value @@ -49092,7 +49092,7 @@ func EnqueueTPUEmbeddingSparseTensorBatchCombiners(value []string) EnqueueTPUEmb } // EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths sets the optional max_sequence_lengths attribute to value. -// If not specified, defaults to {} +// If not specified, defaults to <> func EnqueueTPUEmbeddingSparseTensorBatchMaxSequenceLengths(value []int64) EnqueueTPUEmbeddingSparseTensorBatchAttr { return func(m optionalAttr) { m["max_sequence_lengths"] = value From a9c0ce87a68b30949da76bb921ee5985039f6fb8 Mon Sep 17 00:00:00 2001 From: Lu Wang Date: Thu, 14 May 2020 20:09:23 -0700 Subject: [PATCH 239/412] Check the minimum metadata parser version in the MetadataExtractor Java library. 
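The patch below decides whether metadata can be fully parsed by comparing semantic version strings; as a rough editorial sketch of that rule (split on dots, treat missing levels as zero), expressed in C++ for illustration rather than taken from the patch:

#include <algorithm>
#include <sstream>
#include <string>
#include <vector>

// Returns <0 if v1 precedes v2, 0 if they are equal, >0 otherwise.
// Missing levels count as zero, so "1.14" equals "1.14.0" and "1.9" precedes "1.14".
int CompareSemanticVersions(const std::string& v1, const std::string& v2) {
  auto split = [](const std::string& v) {
    std::vector<int> levels;
    std::stringstream stream(v);
    std::string part;
    while (std::getline(stream, part, '.')) levels.push_back(std::stoi(part));
    return levels;
  };
  const std::vector<int> a = split(v1), b = split(v2);
  const size_t length = std::max(a.size(), b.size());
  for (size_t i = 0; i < length; ++i) {
    const int x = i < a.size() ? a[i] : 0;
    const int y = i < b.size() ? b[i] : 0;
    if (x != y) return x < y ? -1 : 1;
  }
  return 0;
}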
PiperOrigin-RevId: 311657605 Change-Id: I39169392214b8a70d5882c5ec4af93021480ce23 --- .../support/metadata/MetadataExtractor.java | 73 ++++++++++++++++++- .../support/metadata/ModelMetadataInfo.java | 13 ++++ .../support/metadata/metadata_schema.fbs | 33 ++++++++- .../lite/tools/versioning/runtime_version.h | 4 +- 4 files changed, 118 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java index 054ea0e9730..3ded50e5d95 100644 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java +++ b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/MetadataExtractor.java @@ -54,6 +54,11 @@ import org.tensorflow.lite.support.metadata.schema.TensorMetadata; * MetadataExtractor} omits subgraph index as an input in its methods. */ public class MetadataExtractor { + // TODO(b/156539454): remove the hardcode versioning number and populate the version through + // genrule. + /** The version of the metadata parser that this {@link MetadataExtractor} library depends on. */ + public static final String METADATA_PARSER_VERSION = "1.0.0"; + /** The helper class to load metadata from TFLite model FlatBuffer. */ private final ModelInfo modelInfo; @@ -76,6 +81,15 @@ public class MetadataExtractor { ByteBuffer metadataBuffer = modelInfo.getMetadataBuffer(); if (metadataBuffer != null) { metadataInfo = new ModelMetadataInfo(metadataBuffer); + + // Prints warning message if the minimum parser version is not satisfied. + if (!isMinimumParserVersionSatisfied()) { + System.err.printf( + " Some fields in the metadata belong to a future schema. The minimum parser" + + " version required is %s, but the version of the current metadata parser is %s", + metadataInfo.getMininumParserVersion(), METADATA_PARSER_VERSION); + } + checkArgument( modelInfo.getInputTensorCount() == metadataInfo.getInputTensorCount(), String.format( @@ -98,7 +112,7 @@ public class MetadataExtractor { } /** Returns {@code true} if the model has metadata. Otherwise, returns {@code false}. */ - public Boolean hasMetadata() { + public boolean hasMetadata() { return metadataInfo != null; } @@ -216,7 +230,31 @@ public class MetadataExtractor { } /** - * Asserts if {@link metdadataInfo} is not initialized. Some models may not have metadata and this + * Returns {@code true} if the minimum parser version required by the given metadata flatbuffer + * precedes or equals to the version of the metadata parser that this MetadataExtractor library is + * relying on. All fields in the metadata can be parsed correctly with this metadata extractor + * library in this case. Otherwise, it returns {@code false}. + * + *

<p>For example, assume the underlying metadata parser version is {@code 1.14.1},
+ *
+ * <ul>
+ *   <li>it returns {@code true}, if the required minimum parser version is the same or older,
+ *       such as {@code 1.14.1} or {@code 1.14.0}. Null version precedes all numeric versions,
+ *       because some metadata flatbuffers are generated before the first versioned release;
+ *   <li>it returns {@code false}, if the required minimum parser version is newer, such as {@code
+ *       1.14.2}.
+ * </ul>
+ */ + public final boolean isMinimumParserVersionSatisfied() { + String minVersion = metadataInfo.getMininumParserVersion(); + if (minVersion == null) { + return true; + } + return compareVersions(minVersion, METADATA_PARSER_VERSION) <= 0; + } + + /** + * Asserts if {@link #metadataInfo} is not initialized. Some models may not have metadata and this * is allowed. However, invoking methods that reads the metadata is not allowed. * * @throws IllegalStateException if this model does not contain model metadata @@ -260,4 +298,35 @@ public class MetadataExtractor { return null; } } + + /** + * Compares two semantic version numbers. + * + *

<p>Examples of comparing two versions: <br>
+ * {@code 1.9} precedes {@code 1.14}; <br>
+ * {@code 1.14} precedes {@code 1.14.1}; <br>
+ * {@code 1.14} and {@code 1.14.0} are euqal; + * + * @return the value {@code 0} if the two versions are equal; a value less than {@code 0} if + * {@code version1} precedes {@code version2}; a value greater than {@code 0} if {@code + * version2} precedes {@code version1}. + */ + private static int compareVersions(String version1, String version2) { + // Using String.split instead of the recommanded Guava Splitter because we've been avoiding + // depending on other third party libraries in this project. + String[] levels1 = version1.split("\\.", 0); + String[] levels2 = version2.split("\\.", 0); + + int length = Math.max(levels1.length, levels2.length); + for (int i = 0; i < length; i++) { + Integer v1 = i < levels1.length ? Integer.parseInt(levels1[i]) : 0; + Integer v2 = i < levels2.length ? Integer.parseInt(levels2[i]) : 0; + int compare = v1.compareTo(v2); + if (compare != 0) { + return compare; + } + } + + return 0; + } } diff --git a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java index 57fa7113c2a..751ed500dc2 100644 --- a/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java +++ b/tensorflow/lite/experimental/support/metadata/java/src/java/org/tensorflow/lite/support/metadata/ModelMetadataInfo.java @@ -38,6 +38,9 @@ final class ModelMetadataInfo { /** Metadata array of output tensors. */ private final List outputsMetadata; + /** The minimum parser version required to fully understand the metadata flatbuffer. */ + private final String /* @Nullable */ minVersion; + /** * Creates a {@link ModelMetadataInfo} with the metadata FlatBuffer, {@code buffer}. * @@ -56,6 +59,7 @@ final class ModelMetadataInfo { inputsMetadata = getInputsMetadata(modelMetadata); outputsMetadata = getOutputsMetadata(modelMetadata); + minVersion = modelMetadata.minParserVersion(); } /** Gets the count of input tensors with metadata in the metadata FlatBuffer. */ @@ -77,6 +81,15 @@ final class ModelMetadataInfo { return inputsMetadata.get(inputIndex); } + /** + * Gets the minimum parser version of the metadata. It can be {@code null} if the version is not + * populated. + */ + @Nullable + String getMininumParserVersion() { + return minVersion; + } + /** Gets the root handler for the model metadata. */ ModelMetadata getModelMetadata() { return modelMetadata; diff --git a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs index 7e8d148d504..b8e529ad1c5 100644 --- a/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs +++ b/tensorflow/lite/experimental/support/metadata/metadata_schema.fbs @@ -29,11 +29,31 @@ namespace tflite; // generate the model interface. It is recommended to fill in at least those // enties to boost the codegen performance. -// This corresponds to the schema version. +// LINT.IfChange + +// The Metadata schema is versioned by the Semantic versioning number, which +// tracks the schema changes according to the Semantic versioning rules. +// +// ModelMetadata.min_parser_version indicates the minimum necessary metadata +// parser version to fully understand all fields in a given metadata flatbuffer. +// +// New fields and types will have associated comments with the schema version for +// which they were added. 
+// +// Schema Semantic version: 1.0.0 + +// This indicates the flatbuffer compatibility. The number will bump up when a +// break change is applied to the schema, such as removing fields or adding new +// fields to the middle of a table. file_identifier "M001"; // File extension of any written files. file_extension "tflitemeta"; +// LINT.ThenChange(//tensorflow/lite/experimental/\ +// /supportmetadata/java/src/java/org/tensorflow/lite/support/metadata/\ +// MetadataExtractor.java) + +// LINT.IfChange enum AssociatedFileType : byte { UNKNOWN = 0, // Files such as readme.txt @@ -498,6 +518,17 @@ table ModelMetadata { // A list of associated files of this model. associated_files:[AssociatedFile]; + + // The minimum metadata parser version that can fully understand the fields in + // the metadata flatbuffer. The version is effectively the largest version + // number among the versions of all the fields populated and the smallest + // compatible version indicated by the file identifier. + // + // This field is automaticaly populated by the MetadataPopulator when + // the metadata is populated into a TFLite model. + min_parser_version:string; } +// LINT.ThenChange(//tensorflow/lite/experimental/\ +// support/metadata/cc/metadata_version.cc) root_type ModelMetadata; diff --git a/tensorflow/lite/tools/versioning/runtime_version.h b/tensorflow/lite/tools/versioning/runtime_version.h index e4c25221310..ad88bd2ab89 100644 --- a/tensorflow/lite/tools/versioning/runtime_version.h +++ b/tensorflow/lite/tools/versioning/runtime_version.h @@ -24,8 +24,8 @@ namespace tflite { void UpdateMinimumRuntimeVersionForModel(uint8_t* model_buffer_pointer); // Returns true if the first version string precedes the second. -// For example, '1.14' should precede '1.9', also '1.14.1' should precede -// '1.14'. If two version string is equal, then false will be returned. +// For example, '1.9' should precede '1.14', also '1.14' should precede +// '1.14.1'. If two version string is equal, then false will be returned. bool CompareRuntimeVersion(const std::string&, const std::string&); } // namespace tflite From 97f2fffe7ef89d6eb1b013698538b1726345a7e5 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Thu, 14 May 2020 20:45:52 -0700 Subject: [PATCH 240/412] This is an internal change not visible to the public. PiperOrigin-RevId: 311661259 Change-Id: I8443f6037e3cabca1dca72ce1748eea8fd71770c --- tensorflow/core/tpu/tpu_config_c_api.h | 33 +++++++++++++++----------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/tpu/tpu_config_c_api.h b/tensorflow/core/tpu/tpu_config_c_api.h index 334a6a19325..b7caf0648b1 100644 --- a/tensorflow/core/tpu/tpu_config_c_api.h +++ b/tensorflow/core/tpu/tpu_config_c_api.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_CORE_TPU_TPU_CONFIG_C_API_H_ #include +#include #include "tensorflow/c/tf_status.h" @@ -26,29 +27,33 @@ extern "C" { bool TPUHostInitialized(); -// TODO(frankchn): Modify API to take in raw values instead of Tensors. 
-void ConfigureDistributedTpuOp_DoWork(size_t input_size, - TpuSerializedProto** inputs, - TpuSerializedProto* output, +void ConfigureDistributedTpuOp_DoWork(const size_t num_cores_per_host_size, + const int32_t* num_cores_per_host, + size_t* host_config_output_size, + char** host_config_output, TF_Status* status); -void WaitForDistributedTpuOp_DoWork(size_t input_size, - TpuSerializedProto** inputs, - TpuSerializedProto* output, - TF_Status* status); +void WaitForDistributedTpuOp_DoWork( + const size_t num_hosts, const size_t num_cores_per_host, + const int32_t** host_ordinal_to_global_core_id_map, + size_t* tpu_topology_output_size, char** tpu_topology_output, + TF_Status* status); void ShutdownDistributedTpuOp_DoWork(TF_Status* status); void InitializeHostForDistributedTpuOp_DoWork( - size_t input_size, TpuSerializedProto** inputs, - bool enable_whole_mesh_compilations, TpuSerializedProto* output, - TF_Status* status); + const size_t tpu_host_config_size, const char* tpu_host_config, + const bool enable_whole_mesh_compilations, size_t* core_id_output_size, + int32_t** core_id_output, TF_Status* status); -void SetGlobalTPUArrayOp_DoWork(size_t input_size, TpuSerializedProto** inputs, - TF_Status* status); +void SetGlobalTPUArrayOp_DoWork(const size_t tpu_topology_size, + const char* tpu_topology, TF_Status* status); -void DisconnectDistributedTpuChipsOp_DoWork(TpuSerializedProto* output, +void DisconnectDistributedTpuChipsOp_DoWork(int32_t* number_of_chips_output, TF_Status* status); + +void TpuConfigurationApi_FreeCharArray(char* output); +void TpuConfigurationApi_FreeInt32Array(int32_t* output); } #endif // TENSORFLOW_CORE_TPU_TPU_CONFIG_C_API_H_ From 5cf4311435e9087e0e9c7f4e1d4b415de6761530 Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Thu, 14 May 2020 20:59:56 -0700 Subject: [PATCH 241/412] Fix a memory leak. PiperOrigin-RevId: 311662668 Change-Id: I59f9c9cdb8baed7a9828bb818ce1d293d185e6b6 --- tensorflow/c/eager/c_api_remote_test.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index 544dffb664c..d04e4ef4212 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -447,6 +447,9 @@ void VarIsInitialized(TFE_Context* ctx, TFE_TensorHandle* var_handle) { bool initialized = false; memcpy(&initialized, TF_TensorData(t), TF_TensorByteSize(t)); EXPECT_EQ(initialized, true); + TF_DeleteTensor(t); + TFE_DeleteTensorHandle(is_initialized[0]); + TFE_DeleteOp(op); delete status; } From 28899d991f8f7443a04343fe9f308a1ea28a0795 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Thu, 14 May 2020 21:28:44 -0700 Subject: [PATCH 242/412] Optimize int8 broadcast min. 
PiperOrigin-RevId: 311665392 Change-Id: I566547f44975d3d88cb7a17e8c6418a4a186ccda --- .../internal/optimized/optimized_ops.h | 109 ++++++++++++++---- tensorflow/lite/kernels/maximum_minimum.cc | 38 +++++- 2 files changed, 124 insertions(+), 23 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index c72400f33a5..b18f0f4bb5a 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -7963,14 +7963,59 @@ inline void MaximumScalarBroadcast(int size, const ArithmeticParams& params, } } -inline void BroadcastMaximumFivefold( - const ArithmeticParams& unswitched_params, - const RuntimeShape& unswitched_input1_shape, - const int8* unswitched_input1_data, - const RuntimeShape& unswitched_input2_shape, - const int8* unswitched_input2_data, const RuntimeShape& output_shape, - int8* output_data) { - ruy::profiler::ScopeLabel label("BroadcastMaximumFivefoldInt8/8bit"); +// Assume input1 & input2 have the same scale & zero point. +inline void MinimumElementwise(int size, const ArithmeticParams& params, + const int8* input1_data, const int8* input2_data, + int8* output_data) { + int i = 0; +#ifdef USE_NEON + for (; i <= size - 16; i += 16) { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + const int8x16_t min_data = + vminq_s8(input1_val_original, input2_val_original); + vst1q_s8(output_data + i, min_data); + } +#endif // USE_NEON + for (; i < size; ++i) { + const int8 input1_val = input1_data[i]; + const int8 input2_val = input2_data[i]; + output_data[i] = std::min(input1_val, input2_val); + } +} + +inline void MinimumScalarBroadcast(int size, const ArithmeticParams& params, + int8 input1_data, const int8* input2_data, + int8* output_data) { + int i = 0; + +#ifdef USE_NEON + const int8x16_t input1_val_original = vdupq_n_s8(input1_data); + for (; i <= size - 16; i += 16) { + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + const int8x16_t min_data = + vminq_s8(input1_val_original, input2_val_original); + vst1q_s8(output_data + i, min_data); + } +#endif // USE_NEON + for (; i < size; ++i) { + const int8 input2_val = input2_data[i]; + output_data[i] = std::min(input1_data, input2_val); + } +} + +template +inline void BinaryBroadcastFiveFold(const ArithmeticParams& unswitched_params, + const RuntimeShape& unswitched_input1_shape, + const int8* unswitched_input1_data, + const RuntimeShape& unswitched_input2_shape, + const int8* unswitched_input2_data, + const RuntimeShape& output_shape, + int8* output_data, + ElementwiseF elementwise_f, + ScalarBroadcastF scalar_broadcast_f, + const std::string& label_name) { + ruy::profiler::ScopeLabel label(label_name); ArithmeticParams switched_params = unswitched_params; switched_params.input1_offset = unswitched_params.input2_offset; @@ -8000,9 +8045,8 @@ inline void BroadcastMaximumFivefold( const int8* input2_data_reset = input2_data; // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared // between input shapes. y3 for input 1 is always broadcast, and so the - // dimension there is 1, whereas optionally y1 might be broadcast for input 2. - // Put another way, - // input1.shape.FlatSize = y0 * y1 * y2 * y4, + // dimension there is 1, whereas optionally y1 might be broadcast for + // input 2. 
Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4, // input2.shape.FlatSize = y0 * y2 * y3 * y4. int y0 = params.broadcast_shape[0]; int y1 = params.broadcast_shape[1]; @@ -8018,8 +8062,8 @@ inline void BroadcastMaximumFivefold( input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { for (int i3 = 0; i3 < y3; ++i3) { - MaximumElementwise(y4, params, input1_data_ptr, input2_data_ptr, - output_data_ptr); + elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, + output_data_ptr); input2_data_ptr += y4; output_data_ptr += y4; } @@ -8031,23 +8075,23 @@ inline void BroadcastMaximumFivefold( input2_data_reset = input2_data_ptr; } } else { - // Special case of y4 == 1, in which the innermost loop is a single element - // and can be combined with the next (y3) as an inner broadcast. + // Special case of y4 == 1, in which the innermost loop is a single + // element and can be combined with the next (y3) as an inner broadcast. // // Note that this handles the case of pure scalar broadcast when // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar // broadcast with batch (as y2 > 1). // - // NOTE The process is the same as the above general case except simplified - // for y4 == 1 and the loop over y3 is contained within the + // NOTE The process is the same as the above general case except + // simplified for y4 == 1 and the loop over y3 is contained within the // AddScalarBroadcast function. for (int i0 = 0; i0 < y0; ++i0) { const int8* input2_data_ptr = nullptr; for (int i1 = 0; i1 < y1; ++i1) { input2_data_ptr = input2_data_reset; for (int i2 = 0; i2 < y2; ++i2) { - MaximumScalarBroadcast(y3, params, *input1_data_ptr, input2_data_ptr, - output_data_ptr); + scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, + output_data_ptr); input2_data_ptr += y3; output_data_ptr += y3; input1_data_ptr += 1; @@ -8058,7 +8102,6 @@ inline void BroadcastMaximumFivefold( } } -// TODO(b/156140316): Try to unify the broadcast dispatch logic for binary ops. 
template inline void BroadcastMaximumDispatch(const ArithmeticParams& params, const RuntimeShape& input1_shape, @@ -8073,8 +8116,30 @@ inline void BroadcastMaximumDispatch(const ArithmeticParams& params, output_data, op); } - BroadcastMaximumFivefold(params, input1_shape, input1_data, input2_shape, - input2_data, output_shape, output_data); + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + MaximumElementwise, MaximumScalarBroadcast, + "BroadcastMaximumFivefoldInt8/8bit"); +} + +template +inline void BroadcastMinimumDispatch(const ArithmeticParams& params, + const RuntimeShape& input1_shape, + const int8* input1_data, + const RuntimeShape& input2_shape, + const int8* input2_data, + const RuntimeShape& output_shape, + int8* output_data, Op op) { + if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast) { + return reference_ops::MaximumMinimumBroadcastSlow( + input1_shape, input1_data, input2_shape, input2_data, output_shape, + output_data, op); + } + + BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, + input2_data, output_shape, output_data, + MinimumElementwise, MinimumScalarBroadcast, + "BroadcastMinimumFivefoldInt8/8bit"); } } // namespace optimized_ops diff --git a/tensorflow/lite/kernels/maximum_minimum.cc b/tensorflow/lite/kernels/maximum_minimum.cc index abe9647f69e..cad86acd8dd 100644 --- a/tensorflow/lite/kernels/maximum_minimum.cc +++ b/tensorflow/lite/kernels/maximum_minimum.cc @@ -125,6 +125,31 @@ void TFLiteOperation( MaximumOp::template op); } +// Minimum generic opt int8. +template <> +void TFLiteOperation( + TfLiteContext* context, TfLiteNode* node, const OpContext& op_context) { + tflite::ArithmeticParams op_params; + const bool need_broadcast = optimized_ops::ProcessBroadcastShapes( + GetTensorShape(op_context.input1), GetTensorShape(op_context.input2), + &op_params); + if (need_broadcast) { + optimized_ops::BroadcastMinimumDispatch( + op_params, GetTensorShape(op_context.input1), + GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), + GetTensorData(op_context.input2), + GetTensorShape(op_context.output), + GetTensorData(op_context.output), MinimumOp::template op); + return; + } + reference_ops::MaximumMinimumBroadcastSlow( + GetTensorShape(op_context.input1), GetTensorData(op_context.input1), + GetTensorShape(op_context.input2), GetTensorData(op_context.input2), + GetTensorShape(op_context.output), GetTensorData(op_context.output), + MinimumOp::template op); +} + template TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { OpContext op_context(context, node); @@ -186,10 +211,21 @@ TfLiteRegistration* Register_MINIMUM_REF() { maximum_minimum::MinimumOp>}; return &r; } + +TfLiteRegistration* Register_MINIMUM_GENERIC_OPT() { + static TfLiteRegistration r = { + nullptr, nullptr, maximum_minimum::Prepare, + maximum_minimum::Eval}; + return &r; +} + TfLiteRegistration* Register_MAXIMUM() { return Register_MAXIMUM_GENERIC_OPT(); } -TfLiteRegistration* Register_MINIMUM() { return Register_MINIMUM_REF(); } +TfLiteRegistration* Register_MINIMUM() { + return Register_MINIMUM_GENERIC_OPT(); +} } // namespace builtin } // namespace ops From 24c75ce5016efb4ab107f27b96aac07549d8617b Mon Sep 17 00:00:00 2001 From: Hye Soo Yang Date: Thu, 14 May 2020 22:18:50 -0700 Subject: [PATCH 243/412] Addresses https://github.com/tensorflow/tensorflow/issues/38694 by ensuring TensorShape v1/v2 backwards compatibility. 
PiperOrigin-RevId: 311670326 Change-Id: I0e7045ff4eb19cb0096d6fa41a494c8f9a6b85c5 --- tensorflow/python/ops/image_ops_impl.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index e6a5cdbf4e8..52b65efad67 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -1231,8 +1231,10 @@ def _resize_images_common(images, resizer_fn, size, preserve_aspect_ratio, name, name='size') size_const_as_shape = tensor_util.constant_value_as_shape(size) - new_height_const = size_const_as_shape.dims[0].value - new_width_const = size_const_as_shape.dims[1].value + new_height_const = tensor_shape.dimension_at_index(size_const_as_shape, + 0).value + new_width_const = tensor_shape.dimension_at_index(size_const_as_shape, + 1).value # If we can determine that the height and width will be unmodified by this # transformation, we avoid performing the resize. From 2b2e4412053fa3df5861cf1d9a8ff82061e06a65 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 00:27:20 -0700 Subject: [PATCH 244/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/dad2e92eaf53 PiperOrigin-RevId: 311683742 Change-Id: I6177dd65ae548b719c656201ed1a7f9829acd745 --- third_party/mlir/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 8b61ce98dab..1bddf2180bc 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -564,6 +564,7 @@ cc_library( ":StandardOps", ":Support", ":Transforms", + ":VectorOps", ], ) From cca62cc73b19a017ab7e0a1b34690893f68cc9f5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 01:02:05 -0700 Subject: [PATCH 245/412] Add a new DelegateUtils::InvokeWithCPUFallback API for handling delegate failures automatically in Invoke(). This is especially useful for NNAPI usage, as drivers may evict clients. 
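As a minimal usage sketch of the new API (an editorial illustration based on the header comments added in this patch; RunWithFallback is a hypothetical wrapper, and the suitability conditions listed next still apply):

#include "tensorflow/lite/delegates/interpreter_utils.h"
#include "tensorflow/lite/interpreter.h"

// Assumes the interpreter already has a delegate applied and tensors allocated,
// and that the caller keeps no pointers into tensor data across Invoke() calls.
TfLiteStatus RunWithFallback(tflite::Interpreter* interpreter) {
  const TfLiteStatus status =
      tflite::delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter);
  if (status == kTfLiteDelegateError) {
    // Delegation failed but the CPU fallback succeeded: outputs are valid,
    // and all delegates have been undone, so later calls run on CPU.
  } else if (status != kTfLiteOk) {
    // Unexpected runtime failure: outputs are not valid.
  }
  return status;
}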
The fallback is suitable when callers: - Use CPU buffers only (no AHardwareBuffer or GL buffers) - Do not store pointers to tensor data across Invoke() calls, as they are invalidated - Do not have tensor state across multiple Invoke() calls PiperOrigin-RevId: 311687567 Change-Id: Ib3b6fd3fb6c4e85c9512d939d8e87efea03ac4fe --- tensorflow/lite/BUILD | 19 +- tensorflow/lite/core/subgraph.cc | 7 + tensorflow/lite/core/subgraph.h | 6 + tensorflow/lite/delegates/BUILD | 23 ++ .../lite/delegates/interpreter_utils.cc | 67 ++++ tensorflow/lite/delegates/interpreter_utils.h | 52 +++ .../lite/delegates/interpreter_utils_test.cc | 92 +++++ tensorflow/lite/delegates/utils.h | 2 + tensorflow/lite/interpreter.cc | 4 + tensorflow/lite/interpreter.h | 13 + tensorflow/lite/interpreter_test.cc | 309 +--------------- tensorflow/lite/interpreter_test.h | 331 ++++++++++++++++++ 12 files changed, 618 insertions(+), 307 deletions(-) create mode 100644 tensorflow/lite/delegates/interpreter_utils.cc create mode 100644 tensorflow/lite/delegates/interpreter_utils.h create mode 100644 tensorflow/lite/delegates/interpreter_utils_test.cc create mode 100644 tensorflow/lite/interpreter_test.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 14babee2da7..4d8c07aa15b 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -340,11 +340,27 @@ cc_test( ], ) +cc_library( + name = "interpreter_test_fixtures", + testonly = True, + hdrs = ["interpreter_test.h"], + deps = [ + ":framework", + "//tensorflow/lite/core/api", + "//tensorflow/lite/kernels:builtin_ops", + "//tensorflow/lite/kernels:kernel_util", + "//tensorflow/lite/kernels/internal:compatibility", + "@com_google_googletest//:gtest", + ], +) + # Test main interpreter cc_test( name = "interpreter_test", size = "small", - srcs = ["interpreter_test.cc"], + srcs = [ + "interpreter_test.cc", + ], features = ["-dynamic_link_test_srcs"], # see go/dynamic_link_test_srcs tags = [ "tflite_not_portable_ios", # TODO(b/117786830) @@ -352,6 +368,7 @@ cc_test( deps = [ ":external_cpu_backend_context", ":framework", + ":interpreter_test_fixtures", ":string_util", ":version", "//tensorflow/lite/core/api", diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 7f4e0e286ea..81710df128b 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -533,6 +533,11 @@ void Subgraph::SetCancellationFunction(void* data, check_cancelled_func_ = check_cancelled_func; } +bool Subgraph::IsCancelled() { + return (check_cancelled_func_ != nullptr) && + (*check_cancelled_func_)(cancellation_data_); +} + void Subgraph::ReserveNodes(int count) { nodes_and_registration_.reserve(count); } @@ -1316,6 +1321,8 @@ TfLiteStatus Subgraph::RemoveAllDelegates() { return kTfLiteOk; } +bool Subgraph::HasDelegates() { return !delegates_applied_.empty(); } + TfLiteStatus Subgraph::EnsureMemoryAllocations() { if (memory_planner_) { state_ = kStateUninvokable; diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 0b0c1e31e89..d6067daaa6a 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -553,6 +553,9 @@ class Subgraph { // afterwards. TfLiteStatus RemoveAllDelegates(); + // Returns true if the subgraph has delegates applied. + bool HasDelegates(); + // Cleanups up data reserved for the given node. Does not remove the {node, // registration} pair from nodes_and_registrations_. 
void CleanupNode(int node_index); @@ -578,6 +581,9 @@ class Subgraph { // Ensures the memory required is planned and allocated. TfLiteStatus EnsureMemoryAllocations(); + // Returns true if cancellation function returns true. + bool IsCancelled(); + // The state of the Interpreter. enum State { // The interpreter isn't ready to be invoked. diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD index df671675ec9..8d4c921576d 100644 --- a/tensorflow/lite/delegates/BUILD +++ b/tensorflow/lite/delegates/BUILD @@ -43,3 +43,26 @@ cc_test( "@com_google_googletest//:gtest_main", ], ) + +cc_library( + name = "interpreter_utils", + srcs = ["interpreter_utils.cc"], + hdrs = ["interpreter_utils.h"], + copts = tflite_copts(), + deps = [ + "//tensorflow/lite:framework", + ], +) + +cc_test( + name = "interpreter_utils_test", + srcs = ["interpreter_utils_test.cc"], + linkopts = tflite_linkopts(), + linkstatic = 1, + deps = [ + ":interpreter_utils", + "//tensorflow/lite:framework", + "//tensorflow/lite:interpreter_test_fixtures", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/lite/delegates/interpreter_utils.cc b/tensorflow/lite/delegates/interpreter_utils.cc new file mode 100644 index 00000000000..85d79d887fb --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils.cc @@ -0,0 +1,67 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/delegates/interpreter_utils.h" + +namespace tflite { +namespace delegates { +TfLiteStatus InterpreterUtils::InvokeWithCPUFallback(Interpreter* interpreter) { + TfLiteStatus status = interpreter->Invoke(); + if (status == kTfLiteOk || interpreter->IsCancelled() || + !interpreter->HasDelegates()) { + return status; + } + // Retry without delegation. + // TODO(b/138706191): retry only if error is due to delegation. + TF_LITE_REPORT_ERROR( + interpreter->error_reporter(), + "Invoke() failed in the presence of delegation. Retrying without."); + + // Copy input data to a buffer. + // Input data is safe since Subgraph::PrepareOpsAndTensors() passes + // preserve_inputs=true to ArenaPlanner. + std::vector buf; + size_t input_size = 0; + + for (auto i : interpreter->inputs()) { + TfLiteTensor* t = interpreter->tensor(i); + input_size += t->bytes; + } + buf.reserve(input_size); + auto bufp = buf.begin(); + for (auto i : interpreter->inputs()) { + // TF_LITE_ENSURE_STATUS(interpreter->EnsureTensorDataIsReadable(i)); + TfLiteTensor* t = interpreter->tensor(i); + std::copy(t->data.raw, t->data.raw + t->bytes, bufp); + bufp += t->bytes; + } + + TF_LITE_ENSURE_STATUS(interpreter->RemoveAllDelegates()); + + // Copy inputs from buffer. + bufp = buf.begin(); + for (auto i : interpreter->inputs()) { + TfLiteTensor* t = interpreter->tensor(i); + std::copy(bufp, bufp + t->bytes, t->data.raw); + bufp += t->bytes; + } + + // Invoke again. 
+ TF_LITE_ENSURE_STATUS(interpreter->Invoke()); + return kTfLiteDelegateError; +} + +} // namespace delegates +} // namespace tflite diff --git a/tensorflow/lite/delegates/interpreter_utils.h b/tensorflow/lite/delegates/interpreter_utils.h new file mode 100644 index 00000000000..f736c2db1f4 --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils.h @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ +#define TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ + +#include "tensorflow/lite/interpreter.h" + +// Utility functions and classes for using delegates. + +namespace tflite { +namespace delegates { +#if !TFLITE_EXPERIMENTAL_RUNTIME_EAGER +class InterpreterUtils { + public: + /// Invokes an interpreter with automatic fallback from delegation to CPU. + /// + /// If using the delegate fails, the delegate is automatically undone and an + /// attempt made to return the interpreter to an invokable state. + /// + /// Allowing the fallback is suitable only if both of the following hold: + /// - The caller is known not to cache pointers to tensor data across Invoke() + /// calls. + /// - The model is not stateful (no variables, no LSTMs) or the state isn't + /// needed between batches. + /// + /// Returns one of the following three status codes: + /// 1. kTfLiteOk: Success. Output is valid. + /// 2. kTfLiteDelegateError: Delegate error but fallback succeeded. Output is + /// valid. + /// NOTE: This undoes all delegates previously applied to the Interpreter. + /// 3. kTfLiteError: Unexpected/runtime failure. Output is invalid. + /// WARNING: This is an experimental API and subject to change. + static TfLiteStatus InvokeWithCPUFallback(Interpreter* interpreter); +}; +#endif // !TFLITE_EXPERIMENTAL_RUNTIME_EAGER +} // namespace delegates +} // namespace tflite + +#endif // TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ diff --git a/tensorflow/lite/delegates/interpreter_utils_test.cc b/tensorflow/lite/delegates/interpreter_utils_test.cc new file mode 100644 index 00000000000..8dc856d796c --- /dev/null +++ b/tensorflow/lite/delegates/interpreter_utils_test.cc @@ -0,0 +1,92 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/lite/delegates/interpreter_utils.h" + +#include +#include +#include "tensorflow/lite/interpreter_test.h" + +namespace tflite { +namespace { + +TEST_F(TestDelegate, DelegateNodeInvokeFailureFallback) { + delegate_ = std::unique_ptr(new SimpleDelegate( + {0, 1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, + 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + // Delegation modified execution plan. + ASSERT_EQ(interpreter_->execution_plan().size(), 1); + + std::vector input = {1.0f, 2.0f, 3.0f}; + std::vector expected_output = {2.0f, 4.0f, 6.0f}; + constexpr int kOutputTensorIndex = 3; + + memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); + memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); + EXPECT_EQ( + delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), + kTfLiteDelegateError); + // Delegation removed, returning to original execution plan. + ASSERT_EQ(interpreter_->execution_plan().size(), 3); + // Check outputs. + TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; + } +} + +TEST_F(TestDelegate, TestFallbackWithMultipleDelegates) { + // First delegate only supports node 0. + // This delegate should support dynamic tensors, otherwise the second won't be + // applied. + delegate_ = std::unique_ptr( + new SimpleDelegate({0}, kTfLiteDelegateFlagsAllowDynamicTensors)); + // Second delegate supports nodes 1 & 2, and makes the graph immutable. + delegate2_ = std::unique_ptr(new SimpleDelegate( + {1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, + 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); + // Pre-delegation execution plan should have three nodes. + ASSERT_EQ(interpreter_->execution_plan().size(), 3); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), + kTfLiteOk); + ASSERT_EQ( + interpreter_->ModifyGraphWithDelegate(delegate2_->get_tf_lite_delegate()), + kTfLiteOk); + // Should be two delegates nodes. + ASSERT_EQ(interpreter_->execution_plan().size(), 2); + + std::vector input = {1.0f, 2.0f, 3.0f}; + std::vector expected_output = {2.0f, 4.0f, 6.0f}; + constexpr int kOutputTensorIndex = 2; + TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); + + memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); + memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); + EXPECT_EQ( + delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), + kTfLiteDelegateError); + // All delegates should be undone. + EXPECT_EQ(interpreter_->execution_plan().size(), 3); + for (int i = 0; i < 3; ++i) { + EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; + } +} + +} // namespace +} // namespace tflite diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index d6d22c4efa2..3b0668af04b 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_UTILS_H_ #define TENSORFLOW_LITE_DELEGATES_UTILS_H_ +// Utility functions and classes for implementing delegates. 
+ #include #include #include diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index c8ccf671d60..167254a2a62 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -310,6 +310,8 @@ void Interpreter::SetCancellationFunction(void* data, } } +bool Interpreter::IsCancelled() { return primary_subgraph().IsCancelled(); } + TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TfLiteStatus status = kTfLiteOk; for (auto& subgraph : subgraphs_) { @@ -340,6 +342,8 @@ TfLiteStatus Interpreter::RemoveAllDelegates() { return kTfLiteOk; } +bool Interpreter::HasDelegates() { return primary_subgraph().HasDelegates(); } + TfLiteStatus Interpreter::SetBufferHandle(int tensor_index, TfLiteBufferHandle buffer_handle, TfLiteDelegate* delegate) { diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index b93fd76c13b..aa9c54d295f 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -42,6 +42,9 @@ namespace tflite { class InterpreterTest; class TestDelegate; +namespace delegates { +class InterpreterUtils; // Class for friend declarations. +} // namespace delegates namespace impl { @@ -529,6 +532,7 @@ class Interpreter { friend class InterpreterBuilder; friend class tflite::InterpreterTest; friend class tflite::TestDelegate; + friend class tflite::delegates::InterpreterUtils; /// Set the value of an external context. static void SetExternalContext(struct TfLiteContext* context, @@ -542,6 +546,15 @@ class Interpreter { // afterwards. TfLiteStatus RemoveAllDelegates(); + // Returns true if delegates have been applied. + bool HasDelegates(); + + // Returns true if cancellation function returns true. + bool IsCancelled(); + + // Get the error reporter associated with this interpreter. + ErrorReporter* error_reporter() { return error_reporter_; } + // A pure C data structure used to communicate with the pure C plugin // interface. To avoid copying tensor metadata, this is also the definitive // structure to store tensors. diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index abd92ad563d..1d8f82ef16a 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -24,6 +24,7 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/external_cpu_backend_context.h" +#include "tensorflow/lite/interpreter_test.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -35,25 +36,6 @@ limitations under the License. namespace tflite { -// InterpreterTest is a friend of Interpreter, so it can access context_. 
-class InterpreterTest : public ::testing::Test { - public: - template - static TfLiteStatus ModifyGraphWithDelegate( - Interpreter* interpreter, std::unique_ptr delegate) { - Interpreter::TfLiteDelegatePtr tflite_delegate( - delegate.release(), [](TfLiteDelegate* delegate) { - delete reinterpret_cast(delegate); - }); - return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); - } - - protected: - TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } - - Interpreter interpreter_; -}; - namespace ops { namespace builtin { TfLiteRegistration* Register_PADV2(); @@ -1304,291 +1286,6 @@ TEST_F(TestExecutionPlan, NullExecutionPlan) { ASSERT_EQ(run_order_, std::vector()); } -// Build a kernel registration for an op that copies its one input -// to an output -TfLiteRegistration AddOpRegistration() { - TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; - - reg.custom_name = "my_add"; - reg.builtin_code = tflite::BuiltinOperator_CUSTOM; - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1 = GetInput(context, node, 0); - const TfLiteTensor* input2 = GetInput(context, node, 1); - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); - for (int i = 0; i < input1->dims->size; ++i) { - TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); - } - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - - reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { - // Copy input data to output data. - const TfLiteTensor* a0 = GetInput(context, node, 0); - TF_LITE_ENSURE(context, a0); - TF_LITE_ENSURE(context, a0->data.f); - const TfLiteTensor* a1 = GetInput(context, node, 1); - TF_LITE_ENSURE(context, a1); - TF_LITE_ENSURE(context, a1->data.f); - TfLiteTensor* out = GetOutput(context, node, 0); - TF_LITE_ENSURE(context, out); - TF_LITE_ENSURE(context, out->data.f); - int num = a0->dims->data[0]; - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - return kTfLiteOk; - }; - return reg; -} - -} // namespace - -// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). -class TestDelegate : public ::testing::Test { - protected: - void SetUp() override { - interpreter_.reset(new Interpreter); - interpreter_->AddTensors(5); - interpreter_->SetInputs({0, 1}); - interpreter_->SetOutputs({3, 4}); - TfLiteQuantizationParams quant; - interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, - quant); - TfLiteRegistration reg = AddOpRegistration(); - interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); - } - - void TearDown() override { - // Interpreter relies on delegate to free the resources properly. Thus - // the life cycle of delegate must be longer than interpreter. 
- interpreter_.reset(); - delegate_.reset(); - } - - TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; - - TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } - - TfLiteStatus RemoveAllDelegates() { - return interpreter_->RemoveAllDelegates(); - } - - protected: - class SimpleDelegate { - public: - // Create a simple implementation of a TfLiteDelegate. We use the C++ class - // SimpleDelegate and it can produce a handle TfLiteDelegate that is - // value-copyable and compatible with TfLite. - // fail_node_prepare: To simulate failure of Delegate node's Prepare(). - // min_ops_per_subset: If >0, partitioning preview is used to choose only - // those subsets with min_ops_per_subset number of nodes. - // fail_node_invoke: To simulate failure of Delegate node's Invoke(). - explicit SimpleDelegate( - const std::vector& nodes, - TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, - bool fail_node_prepare = false, int min_ops_per_subset = 0, - bool fail_node_invoke = false) - : nodes_(nodes), - fail_delegate_node_prepare_(fail_node_prepare), - min_ops_per_subset_(min_ops_per_subset), - fail_delegate_node_invoke_(fail_node_invoke) { - delegate_.Prepare = [](TfLiteContext* context, - TfLiteDelegate* delegate) -> TfLiteStatus { - auto* simple = static_cast(delegate->data_); - TfLiteIntArray* nodes_to_separate = - TfLiteIntArrayCreate(simple->nodes_.size()); - // Mark nodes that we want in TfLiteIntArray* structure. - int index = 0; - for (auto node_index : simple->nodes_) { - nodes_to_separate->data[index++] = node_index; - // make sure node is added - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - // Check that all nodes are available - TfLiteIntArray* execution_plan; - TF_LITE_ENSURE_STATUS( - context->GetExecutionPlan(context, &execution_plan)); - for (int exec_index = 0; exec_index < execution_plan->size; - exec_index++) { - int node_index = execution_plan->data[exec_index]; - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - if (exec_index == node_index) { - // Check op details only if it wasn't delegated already. - TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - } - - // Get preview of delegate partitioning from the context. - TfLiteDelegateParams* params_array; - int num_partitions; - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - if (simple->min_ops_per_subset() > 0) { - // Build a new vector of ops from subsets with atleast the minimum - // size. - std::vector allowed_ops; - for (int idx = 0; idx < num_partitions; ++idx) { - const auto* nodes_in_subset = params_array[idx].nodes_to_replace; - if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; - allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, - nodes_in_subset->data + nodes_in_subset->size); - } - - // Free existing nodes_to_separate & initialize a new array with - // allowed_ops. 
- TfLiteIntArrayFree(nodes_to_separate); - nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); - memcpy(nodes_to_separate->data, allowed_ops.data(), - sizeof(int) * nodes_to_separate->size); - } - - // Another call to PreviewDelegateParitioning should be okay, since - // partitioning memory is managed by context. - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - context->ReplaceNodeSubsetsWithDelegateKernels( - context, simple->FakeFusedRegistration(), nodes_to_separate, - delegate); - TfLiteIntArrayFree(nodes_to_separate); - return kTfLiteOk; - }; - delegate_.CopyToBufferHandle = [](TfLiteContext* context, - TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* tensor) -> TfLiteStatus { - // TODO(ycling): Implement tests to test buffer copying logic. - return kTfLiteOk; - }; - delegate_.CopyFromBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* output) -> TfLiteStatus { - TFLITE_CHECK_GE(buffer_handle, -1); - TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); - const float floats[] = {6., 6., 6.}; - int num = output->dims->data[0]; - for (int i = 0; i < num; i++) { - output->data.f[i] = floats[i]; - } - return kTfLiteOk; - }; - - delegate_.FreeBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; - // Store type-punned data SimpleDelegate structure. - delegate_.data_ = static_cast(this); - delegate_.flags = delegate_flags; - } - - TfLiteRegistration FakeFusedRegistration() { - TfLiteRegistration reg = {nullptr}; - reg.custom_name = "fake_fused_op"; - - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - // Copy input data to output data. 
- const TfLiteTensor* a0; - const TfLiteTensor* a1; - if (node->inputs->size == 2) { - a0 = GetInput(context, node, 0); - a1 = GetInput(context, node, 1); - } else { - a0 = GetInput(context, node, 0); - a1 = a0; - } - TfLiteTensor* out = GetOutput(context, node, 0); - int num = 1; - for (int i = 0; i < a0->dims->size; ++i) { - num *= a0->dims->data[i]; - } - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - // Make the data stale so that CopyFromBufferHandle can be invoked - out->data_is_stale = true; - return kTfLiteOk; - }; - if (fail_delegate_node_invoke_) { - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - return kTfLiteError; - }; - } - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1; - const TfLiteTensor* input2; - if (node->inputs->size == 2) { - input1 = GetInput(context, node, 0); - input2 = GetInput(context, node, 1); - } else { - input1 = GetInput(context, node, 0); - input2 = input1; - } - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - if (fail_delegate_node_prepare_) { - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - return kTfLiteError; - }; - } - - return reg; - } - - TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } - - int min_ops_per_subset() { return min_ops_per_subset_; } - - private: - std::vector nodes_; - TfLiteDelegate delegate_; - bool fail_delegate_node_prepare_ = false; - int min_ops_per_subset_ = 0; - bool fail_delegate_node_invoke_ = false; - }; - - std::unique_ptr interpreter_; - std::unique_ptr delegate_, delegate2_; -}; -namespace { - TEST_F(TestDelegate, BasicDelegate) { delegate_ = std::unique_ptr(new SimpleDelegate({0, 1, 2})); interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()); @@ -1967,7 +1664,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { // Verify Invoke() behavior. memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - interpreter_->Invoke(); + EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); for (int i = 0; i < 3; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } @@ -1981,7 +1678,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { memcpy(interpreter_->typed_tensor(0), input.data(), 4 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 4 * sizeof(float)); - interpreter_->Invoke(); + EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); for (int i = 0; i < 4; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } diff --git a/tensorflow/lite/interpreter_test.h b/tensorflow/lite/interpreter_test.h new file mode 100644 index 00000000000..d4f0c8a05c5 --- /dev/null +++ b/tensorflow/lite/interpreter_test.h @@ -0,0 +1,331 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_LITE_INTERPRETER_TEST_H_ +#define TENSORFLOW_LITE_INTERPRETER_TEST_H_ + +#include +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/interpreter.h" +#include "tensorflow/lite/kernels/internal/compatibility.h" +#include "tensorflow/lite/kernels/kernel_util.h" +#include "tensorflow/lite/kernels/register.h" + +namespace tflite { +// InterpreterTest is a friend of Interpreter, so it can access context_. +class InterpreterTest : public ::testing::Test { + public: + template + static TfLiteStatus ModifyGraphWithDelegate( + Interpreter* interpreter, std::unique_ptr delegate) { + Interpreter::TfLiteDelegatePtr tflite_delegate( + delegate.release(), [](TfLiteDelegate* delegate) { + delete reinterpret_cast(delegate); + }); + return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); + } + + protected: + TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } + + Interpreter interpreter_; +}; + +// Build a kernel registration for an op that copies its one input +// to an output +TfLiteRegistration AddOpRegistration() { + TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + + reg.custom_name = "my_add"; + reg.builtin_code = tflite::BuiltinOperator_CUSTOM; + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1 = GetInput(context, node, 0); + const TfLiteTensor* input2 = GetInput(context, node, 1); + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); + for (int i = 0; i < input1->dims->size; ++i) { + TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); + } + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + + reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { + // Copy input data to output data. + const TfLiteTensor* a0 = GetInput(context, node, 0); + TF_LITE_ENSURE(context, a0); + TF_LITE_ENSURE(context, a0->data.f); + const TfLiteTensor* a1 = GetInput(context, node, 1); + TF_LITE_ENSURE(context, a1); + TF_LITE_ENSURE(context, a1->data.f); + TfLiteTensor* out = GetOutput(context, node, 0); + TF_LITE_ENSURE(context, out); + TF_LITE_ENSURE(context, out->data.f); + int num = a0->dims->data[0]; + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + return kTfLiteOk; + }; + return reg; +} + +// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). 
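+// The fixture's SetUp() builds a five-tensor graph with three custom "my_add"
+// nodes (inputs {0, 1}, outputs {3, 4}); tests hand arbitrary subsets of those
+// node indices to the SimpleDelegate defined below to exercise delegation.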
+class TestDelegate : public ::testing::Test { + protected: + void SetUp() override { + interpreter_.reset(new Interpreter); + interpreter_->AddTensors(5); + interpreter_->SetInputs({0, 1}); + interpreter_->SetOutputs({3, 4}); + TfLiteQuantizationParams quant; + interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, + quant); + TfLiteRegistration reg = AddOpRegistration(); + interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); + } + + void TearDown() override { + // Interpreter relies on delegate to free the resources properly. Thus + // the life cycle of delegate must be longer than interpreter. + interpreter_.reset(); + delegate_.reset(); + } + + TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; + + TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } + + TfLiteStatus RemoveAllDelegates() { + return interpreter_->RemoveAllDelegates(); + } + + protected: + class SimpleDelegate { + public: + // Create a simple implementation of a TfLiteDelegate. We use the C++ class + // SimpleDelegate and it can produce a handle TfLiteDelegate that is + // value-copyable and compatible with TfLite. + // fail_node_prepare: To simulate failure of Delegate node's Prepare(). + // min_ops_per_subset: If >0, partitioning preview is used to choose only + // those subsets with min_ops_per_subset number of nodes. + // fail_node_invoke: To simulate failure of Delegate node's Invoke(). + explicit SimpleDelegate( + const std::vector& nodes, + TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, + bool fail_node_prepare = false, int min_ops_per_subset = 0, + bool fail_node_invoke = false) + : nodes_(nodes), + fail_delegate_node_prepare_(fail_node_prepare), + min_ops_per_subset_(min_ops_per_subset), + fail_delegate_node_invoke_(fail_node_invoke) { + delegate_.Prepare = [](TfLiteContext* context, + TfLiteDelegate* delegate) -> TfLiteStatus { + auto* simple = static_cast(delegate->data_); + TfLiteIntArray* nodes_to_separate = + TfLiteIntArrayCreate(simple->nodes_.size()); + // Mark nodes that we want in TfLiteIntArray* structure. + int index = 0; + for (auto node_index : simple->nodes_) { + nodes_to_separate->data[index++] = node_index; + // make sure node is added + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + // Check that all nodes are available + TfLiteIntArray* execution_plan; + TF_LITE_ENSURE_STATUS( + context->GetExecutionPlan(context, &execution_plan)); + for (int exec_index = 0; exec_index < execution_plan->size; + exec_index++) { + int node_index = execution_plan->data[exec_index]; + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + if (exec_index == node_index) { + // Check op details only if it wasn't delegated already. 
+ TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + } + + // Get preview of delegate partitioning from the context. + TfLiteDelegateParams* params_array; + int num_partitions; + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + if (simple->min_ops_per_subset() > 0) { + // Build a new vector of ops from subsets with atleast the minimum + // size. + std::vector allowed_ops; + for (int idx = 0; idx < num_partitions; ++idx) { + const auto* nodes_in_subset = params_array[idx].nodes_to_replace; + if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; + allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, + nodes_in_subset->data + nodes_in_subset->size); + } + + // Free existing nodes_to_separate & initialize a new array with + // allowed_ops. + TfLiteIntArrayFree(nodes_to_separate); + nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); + memcpy(nodes_to_separate->data, allowed_ops.data(), + sizeof(int) * nodes_to_separate->size); + } + + // Another call to PreviewDelegateParitioning should be okay, since + // partitioning memory is managed by context. + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + context->ReplaceNodeSubsetsWithDelegateKernels( + context, simple->FakeFusedRegistration(), nodes_to_separate, + delegate); + TfLiteIntArrayFree(nodes_to_separate); + return kTfLiteOk; + }; + delegate_.CopyToBufferHandle = [](TfLiteContext* context, + TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* tensor) -> TfLiteStatus { + // TODO(b/156586986): Implement tests to test buffer copying logic. + return kTfLiteOk; + }; + delegate_.CopyFromBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* output) -> TfLiteStatus { + TFLITE_CHECK_GE(buffer_handle, -1); + TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); + const float floats[] = {6., 6., 6.}; + int num = output->dims->data[0]; + for (int i = 0; i < num; i++) { + output->data.f[i] = floats[i]; + } + return kTfLiteOk; + }; + + delegate_.FreeBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; + // Store type-punned data SimpleDelegate structure. + delegate_.data_ = static_cast(this); + delegate_.flags = delegate_flags; + } + + TfLiteRegistration FakeFusedRegistration() { + TfLiteRegistration reg = {nullptr}; + reg.custom_name = "fake_fused_op"; + + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + // Copy input data to output data. 
+ const TfLiteTensor* a0; + const TfLiteTensor* a1; + if (node->inputs->size == 2) { + a0 = GetInput(context, node, 0); + a1 = GetInput(context, node, 1); + } else { + a0 = GetInput(context, node, 0); + a1 = a0; + } + TfLiteTensor* out = GetOutput(context, node, 0); + int num = 1; + for (int i = 0; i < a0->dims->size; ++i) { + num *= a0->dims->data[i]; + } + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + // Make the data stale so that CopyFromBufferHandle can be invoked + if (out->buffer_handle != kTfLiteNullBufferHandle) { + out->data_is_stale = true; + } + return kTfLiteOk; + }; + if (fail_delegate_node_invoke_) { + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + return kTfLiteError; + }; + } + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1; + const TfLiteTensor* input2; + if (node->inputs->size == 2) { + input1 = GetInput(context, node, 0); + input2 = GetInput(context, node, 1); + } else { + input1 = GetInput(context, node, 0); + input2 = input1; + } + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + if (fail_delegate_node_prepare_) { + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + return kTfLiteError; + }; + } + + return reg; + } + + TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } + + int min_ops_per_subset() { return min_ops_per_subset_; } + + private: + std::vector nodes_; + TfLiteDelegate delegate_; + bool fail_delegate_node_prepare_ = false; + int min_ops_per_subset_ = 0; + bool fail_delegate_node_invoke_ = false; + }; + + std::unique_ptr interpreter_; + std::unique_ptr delegate_, delegate2_; +}; +} // namespace tflite + +#endif // TENSORFLOW_LITE_INTERPRETER_TEST_H_ From f6e2a28158eadfffc1e385abb054502ca4cf96d4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 02:02:46 -0700 Subject: [PATCH 246/412] Update GraphDef version to 402. PiperOrigin-RevId: 311693254 Change-Id: Ieb9a5f81784013ad6bb4a6b1fd1f119b34c68604 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index a534c0cf827..8f0967c1eaa 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 401 // Updated: 2020/5/14 +#define TF_GRAPH_DEF_VERSION 402 // Updated: 2020/5/15 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From ab70af78dcac29dc886456b475e114848fd74665 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 02:02:49 -0700 Subject: [PATCH 247/412] compat: Update forward compatibility horizon to 2020-05-15 PiperOrigin-RevId: 311693261 Change-Id: Id490a7dbe95ff4e493b3490d71c92a9b41f2b484 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 2a21590bb9a..29ba7317747 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. 
It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 14) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 15) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 06c671cde8970541f3a8ef7604502adf9d4e5099 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 02:46:38 -0700 Subject: [PATCH 248/412] [XLA] Verify statically shaped result type in `xla_hlo.reshape` operation The result type of the `xla_hlo.reshape` operation must have a static shape. This is now checked by the operation's verifier. PiperOrigin-RevId: 311697582 Change-Id: I90e8e513d205d62bb052a6cbecd7ebd88614db6d --- tensorflow/compiler/mlir/xla/ir/hlo_ops.cc | 26 ++++++----- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- .../compiler/mlir/xla/tests/legalize-tf.mlir | 43 ++++++++++--------- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc index 68eafb8b33e..b6036ee2130 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.cc @@ -1358,19 +1358,23 @@ static LogicalResult Verify(PadOp op) { //===----------------------------------------------------------------------===// static LogicalResult Verify(ReshapeOp op) { - auto operand_ty = op.operand().getType().cast(); + // If the operand type is dynamically shaped there is nothing to verify. + auto operand_ty = op.operand().getType().cast(); if (!operand_ty || !operand_ty.hasStaticShape()) return success(); - int64_t num_input_elements = operand_ty.getNumElements(); - auto out_ty = op.getType().cast(); - if (out_ty && out_ty.hasStaticShape()) { - int64_t num_output_elements = out_ty.getNumElements(); - if (num_input_elements != num_output_elements) - return op.emitOpError() - << "number of output elements (" << num_output_elements - << ") doesn't match expected number of elements (" - << num_input_elements << ")"; - } + // If the operand type is statically shaped (not required) the number of + // elements must match that of the result type. 
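+  // The result type itself is constrained to be statically shaped by the op
+  // definition (see the HLO_StaticShapeTensor change below), so the assert
+  // that follows merely documents that invariant.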
+ auto result_ty = op.getType().cast(); + assert(result_ty && result_ty.hasStaticShape() && + "result type must be statically shaped"); + int64_t num_result_elements = result_ty.getNumElements(); + int64_t num_operand_elements = operand_ty.getNumElements(); + if (num_result_elements != num_operand_elements) + return op.emitOpError() + << "number of output elements (" << num_result_elements + << ") doesn't match expected number of elements (" + << num_operand_elements << ")"; + return success(); } diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index f78ac7624d2..1ca4e0c5d82 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -1058,7 +1058,7 @@ def HLO_ReshapeOp: HLO_Op<"reshape", [NoSideEffect, SameOperandsAndResultElementType]>, BASE_HLO_ReshapeOp { let arguments = (ins HLO_Tensor:$operand); - let results = (outs HLO_Tensor); + let results = (outs HLO_StaticShapeTensor); let hasFolder = 1; let hasCustomHLOConverter = 1; diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 450910b2e4d..d5440a024ab 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -3555,30 +3555,31 @@ func @assert(%arg0: tensor, %arg1: tensor<*xf32>) { // tf.Unpack legalization //===----------------------------------------------------------------------===// -// CHECK-LABEL: @unpack -func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - // CHECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> - // CHECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// TODO(b/156340000): Re-enable when fixed. 
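+// The tests below reshape to a partially dynamic result (e.g. tensor<4x?xf32>),
+// which the tightened reshape verifier above now rejects.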
+// // C-HECK-LABEL: @unpack +// func @unpack(%input: tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 1, 6]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES1:.*]] = "xla_hlo.reshape"(%[[SLICE1]]) : (tensor<4x1x6xf32>) -> tensor<4x?xf32> +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 2, 6]> : tensor<3xi64>, start_indices = dense<[0, 1, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES2:.*]] = "xla_hlo.reshape"(%[[SLICE2]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> +// // C-HECK: %[[SLICE3:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, 3, 6]> : tensor<3xi64>, start_indices = dense<[0, 2, 0]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor<4x3x6xf32>) -> tensor<4x1x6xf32> +// // C-HECK: %[[RES3:.*]] = "xla_hlo.reshape"(%[[SLICE3]]) : (tensor<4x1x6xf32>) -> tensor<4x6xf32> - %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) - // return %[[RES1]], %[[RES2]], %[[RES3]] - return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> -} +// %0:3 = "tf.Unpack"(%input) {axis = 1} : (tensor<4x3x6xf32>) -> (tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32>) +// // return %[[RES1]], %[[RES2]], %[[RES3]] +// return %0#0, %0#1, %0#2 : tensor<4x?xf32>, tensor<4x6xf32>, tensor<4x6xf32> +// } -// CHECK-LABEL: @unpack_dynamic -func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { - // CHECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor - // CHECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor - // CHECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor +// // C-HECK-LABEL: @unpack_dynamic +// func @unpack_dynamic(%input: tensor) -> (tensor, tensor) { +// // C-HECK: %[[SLICE1:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 1]> : tensor<3xi64>, start_indices = dense<0> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE1]]) : (tensor) -> tensor +// // C-HECK: %[[SLICE2:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, -1, 2]> : tensor<3xi64>, start_indices = dense<[0, 0, 1]> : tensor<3xi64>, strides = dense<1> : tensor<3xi64>} : (tensor) -> tensor +// // C-HECK: "xla_hlo.reshape"(%[[SLICE2]]) : (tensor) -> tensor - %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) - return %0#0, %0#1 : tensor, tensor -} +// %0:2 = "tf.Unpack"(%input) {axis = -1} : (tensor) -> (tensor, tensor) +// return %0#0, %0#1 : tensor, tensor +// } //===----------------------------------------------------------------------===// // tf.UnsortedSegment{Max|Min|Prod|Sum} legalization From bfe99ed9d61220cae4eb9f4a7e35cb113cd6fce9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 03:33:15 -0700 Subject: [PATCH 249/412] Remove default initialization in tflite::StatefulNnApiDelegate::Data struct. 
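The flag is surfaced to clients through `StatefulNnApiDelegate::Options`; with
the field initializer removed, `Data::allow_fp16` is expected to be set from
those options when the delegate is constructed (the constructor is not part of
this hunk). A minimal client-side sketch, with interpreter construction assumed
and illustrative variable names:

  #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h"

  tflite::StatefulNnApiDelegate::Options options;
  options.allow_fp16 = true;  // Let fp32 computation run in fp16 on NNAPI.
  tflite::StatefulNnApiDelegate nnapi_delegate(options);
  interpreter->ModifyGraphWithDelegate(&nnapi_delegate);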
PiperOrigin-RevId: 311702143 Change-Id: I317473ef15a0ee8f31b1b99ee6e9f23f9f4f19cd --- tensorflow/lite/delegates/nnapi/nnapi_delegate.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index 1bd9fb5c49f..b94c6d66978 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -90,7 +90,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // of number of nodes and selecting them until the limit is reached. int max_number_delegated_partitions = 3; - // allow fp32 compuation to be run in fp16 + // allow fp32 compuation to be run in fp16. bool allow_fp16 = false; }; @@ -187,8 +187,8 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // Maximum number of NNAPI partition to delegate. Zero or negative means // no limit. Copied from StatefulNnApiDelegate::Options int max_number_delegated_partitions; - // allow fp32 computation to be run in fp32 - bool allow_fp16 = false; + // allow fp32 computation to be run in fp16. + bool allow_fp16; ~Data(); From df2c8d282320ad4c8a81f1ec631537ad4752cfeb Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Fri, 15 May 2020 03:46:01 -0700 Subject: [PATCH 250/412] Allow index typed values in `hlo_scalars_to_dimension_tensor`. The limitation stems from a time where tensors with index element type were not allowed in MLIR. With this change, we can remove many `index_cast` ops. PiperOrigin-RevId: 311703219 Change-Id: I56c7dba29e43b3ee13a1066c0974f72e696600ab --- tensorflow/compiler/mlir/xla/ir/hlo_ops.td | 2 +- .../compiler/mlir/xla/ir/hlo_ops_base.td | 4 ++- .../xla/tests/materialize-broadcasts.mlir | 28 +++++++--------- tensorflow/compiler/mlir/xla/tests/ops.mlir | 8 +++++ .../mlir/xla/tests/unfuse_batch_norm.mlir | 19 ++++------- .../xla/transforms/materialize_broadcasts.cc | 33 ++++++++----------- .../mlir/xla/transforms/unfuse_batch_norm.cc | 4 +-- 7 files changed, 45 insertions(+), 53 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td index 1ca4e0c5d82..0d7771b180e 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.td @@ -785,7 +785,7 @@ def HLO_ScalarsToDimensionTensorOp : HLO_Op<"scalars_to_dimension_tensor", compute shape arguments to dynamic operations. }]; - let arguments = (ins Variadic:$scalars); + let arguments = (ins Variadic:$scalars); let results = (outs HLO_DimensionTensor); // Cannot be exported to legacy formats. diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td index b5de675f13f..b5130eafd0e 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops_base.td @@ -62,9 +62,11 @@ def HLO_Tuple : NestedTupleOf<[HLO_Tensor, HLO_Token]>; def HLO_TensorOrTuple : AnyTypeOf<[HLO_Tensor, HLO_Tuple]>; +def HLO_DimensionValue : AnyTypeOf<[Index, HLO_Pred, HLO_Int]>; + // Dynamic representation of a shape vector as a tensor. 
def HLO_DimensionTensor : ShapedContainerType< - [Index, HLO_Pred, HLO_Int], + [HLO_DimensionValue], And<[IsTensorTypePred, HasAnyRankOfPred<[1]>]>, "a 1D tensor of dimensions">; diff --git a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir index 2340650dda8..a7f4a5b4474 100644 --- a/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir +++ b/tensorflow/compiler/mlir/xla/tests/materialize-broadcasts.mlir @@ -245,16 +245,14 @@ func @compareBroadcastRhs(%arg0: tensor<1x4xf32>, %arg1: tensor<4xf32>) -> tenso // CHECK-LABEL: @dynamicCompareBroadcastRhs func @dynamicCompareBroadcastRhs(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 // CHECK-NEXT: %c1 = constant 1 : index // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index - // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: %[[DIM1:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0]], %[[DIM1]]) : (index, index) -> tensor<2xindex> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: "xla_hlo.compare"(%[[BROADCAST0]], %[[BROADCAST1]]) {comparison_direction = "NE"} : (tensor, tensor) -> tensor %0 = "xla_hlo.compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = "NE"} : (tensor, tensor) -> tensor return %0 : tensor @@ -265,16 +263,14 @@ func @dynamicCompareBroadcastRhs(%arg0: tensor, %arg1: tensor) - // CHECK-LABEL: @dynamicBroadcastAdd func @dynamicBroadcastAdd(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 // CHECK-NEXT: %c1 = constant 1 : index // CHECK-NEXT: %[[DIM1_0:.*]] = dim %arg0, 1 : tensor // CHECK-NEXT: %[[DIM1_1:.*]] = dim %arg1, 0 : tensor // CHECK-NEXT: %[[CMPI:.*]] = cmpi "eq", %[[DIM1_0]], %c1 : index - // CHECK-NEXT: %[[SEL:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[SEL]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // 
CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: %[[DIM1:.*]] = select %[[CMPI]], %[[DIM1_0]], %[[DIM1_1]] : index + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0]], %[[DIM1]]) : (index, index) -> tensor<2xindex> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor return %0 : tensor @@ -285,12 +281,10 @@ func @dynamicBroadcastAdd(%arg0: tensor, %arg1: tensor) -> tenso // CHECK-LABEL: @dynamicBroadcastAddScalar func @dynamicBroadcastAddScalar(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NEXT: %[[DIM0:.*]] = dim %arg0, 0 : tensor - // CHECK-NEXT: %[[DIM0C:.*]] = index_cast %[[DIM0]] : index to i32 // CHECK-NEXT: %[[DIM1:.*]] = dim %arg0, 1 : tensor - // CHECK-NEXT: %[[DIM1C:.*]] = index_cast %[[DIM1]] : index to i32 - // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0C]], %[[DIM1C]]) : (i32, i32) -> tensor<2xi32> - // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xi32>) -> tensor - // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<2xi32>) -> tensor + // CHECK-NEXT: %[[SHAPE:.*]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM0]], %[[DIM1]]) : (index, index) -> tensor<2xindex> + // CHECK-NEXT: %[[BROADCAST0:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg0, %[[SHAPE]]) {broadcast_dimensions = dense<[0, 1]> : tensor<2xi64>} : (tensor, tensor<2xindex>) -> tensor + // CHECK-NEXT: %[[BROADCAST1:.*]] = "xla_hlo.dynamic_broadcast_in_dim"(%arg1, %[[SHAPE]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<2xindex>) -> tensor // CHECK-NEXT: xla_hlo.add %[[BROADCAST0]], %[[BROADCAST1]] : tensor %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor return %0 : tensor diff --git a/tensorflow/compiler/mlir/xla/tests/ops.mlir b/tensorflow/compiler/mlir/xla/tests/ops.mlir index 8cb63311657..f09ec62c8dc 100644 --- a/tensorflow/compiler/mlir/xla/tests/ops.mlir +++ b/tensorflow/compiler/mlir/xla/tests/ops.mlir @@ -461,6 +461,14 @@ func @scalars_to_dimension_tensor(%arg0: i32, %arg1: i32) -> tensor<2xi32> { // ----- +// CHECK-LABEL: @scalars_to_dimension_tensor_index +func @scalars_to_dimension_tensor_index(%arg0: index, %arg1: index) -> tensor<2xindex> { + %0 = "xla_hlo.scalars_to_dimension_tensor"(%arg0, %arg1) : (index, index) -> tensor<2xindex> + return %0 : tensor<2xindex> +} + +// ----- + // CHECK-LABEL: func @select func @select(%arg0: tensor<2x3xi1>, %arg1: tensor<2x3xi32>, %arg2: tensor<2x3xi32>) -> tensor<2x3xi32> { %0 = "xla_hlo.select"(%arg0, %arg1, %arg2) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> diff --git a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir 
b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir index 9778772e250..7a54de73db7 100644 --- a/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir +++ b/tensorflow/compiler/mlir/xla/tests/unfuse_batch_norm.mlir @@ -106,24 +106,19 @@ func @batchNormInference_dynamic_shape( -> tensor { // CHECK-DAG: %[[EPS:.+]] = xla_hlo.constant dense<1.000000e-03> : tensor // CHECK-DAG: %[[DIM:.+]] = dim %[[VARIANCE]], 0 : tensor - // CHECK-DAG: %[[INDEX_CAST:.+]] = index_cast %[[DIM]] : index to i32 - // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INDEX_CAST]]) : (i32) -> tensor<1xi32> - // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor + // CHECK-DAG: %[[TO_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[DIM]]) : (index) -> tensor<1xindex> + // CHECK-DAG: %[[EPS_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[EPS]], %[[TO_DIM_TENSOR]]) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xindex>) -> tensor // CHECK-DAG: %[[VARIANCE_EPS:.+]] = xla_hlo.add %[[VARIANCE]], %[[EPS_BCAST]] : tensor // CHECK-DAG: %[[STDDEV:.+]] = "xla_hlo.sqrt"(%[[VARIANCE_EPS]]) : (tensor) -> tensor // CHECK-DAG: %[[INPUT_DIM_0:.+]] = dim %[[X]], 0 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_0:.+]] = index_cast %[[INPUT_DIM_0]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_1:.+]] = dim %[[X]], 1 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_1:.+]] = index_cast %[[INPUT_DIM_1]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_2:.+]] = dim %[[X]], 2 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_2:.+]] = index_cast %[[INPUT_DIM_2]] : index to i32 // CHECK-DAG: %[[INPUT_DIM_3:.+]] = dim %[[X]], 3 : tensor - // CHECK-DAG: %[[INPUT_INDEX_CAST_3:.+]] = index_cast %[[INPUT_DIM_3]] : index to i32 - // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_INDEX_CAST_0]], %[[INPUT_INDEX_CAST_1]], %[[INPUT_INDEX_CAST_2]], %[[INPUT_INDEX_CAST_3]]) : (i32, i32, i32, i32) -> tensor<4xi32> - // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[OFFSET_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor - // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xi32>) -> tensor + // CHECK-DAG: %[[TO_INPUT_DIM_TENSOR:.+]] = "xla_hlo.scalars_to_dimension_tensor"(%[[INPUT_DIM_0]], %[[INPUT_DIM_1]], %[[INPUT_DIM_2]], %[[INPUT_DIM_3]]) : (index, index, index, index) -> tensor<4xindex> + // CHECK-DAG: %[[STDDEV_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[STDDEV]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[SCALE_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[SCALE]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[OFFSET_BCAST:.+]] = 
"xla_hlo.dynamic_broadcast_in_dim"(%[[OFFSET]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor + // CHECK-DAG: %[[MEAN_BCAST:.+]] = "xla_hlo.dynamic_broadcast_in_dim"(%[[MEAN]], %[[TO_INPUT_DIM_TENSOR]]) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor<4xindex>) -> tensor // CHECK-DAG: %[[X_CENTER:.+]] = xla_hlo.subtract %[[X]], %[[MEAN_BCAST]] : tensor // CHECK-DAG: %[[X_SCALED:.+]] = xla_hlo.multiply %[[X_CENTER]], %[[SCALE_BCAST]] : tensor // CHECK-DAG: %[[X_NORMED:.+]] = xla_hlo.divide %[[X_SCALED]], %[[STDDEV_BCAST]] : tensor diff --git a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc index bf666400900..7b4262825f8 100644 --- a/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc +++ b/tensorflow/compiler/mlir/xla/transforms/materialize_broadcasts.cc @@ -143,28 +143,23 @@ std::vector ComputeBroadcastedShape(SrcOp op, Value small, Value large, // either be the same in that dimension or it can be 1, in which case the // shape of the other operand is used. for (int i = 0; i < output_rank; ++i) { - Value index_value; if (indexes[i] == kExpandShape) { // The smaller shape gets expanded to the larger one in this case. - index_value = rewriter->create(loc, large, i); - } else { - // Compute the result shape depending on whether the rank of smaller is 1. - // This does not check that the broadcast operation actualy is correct. - // In particular, we do not check that both shapes are the same if the - // smaller ranked shape is not 1. - ConstantOp one = rewriter->create( - loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); - DimOp lrg_dim = rewriter->create(loc, large, i); - DimOp sml_dim = rewriter->create(loc, small, indexes[i]); - CmpIOp compare = - rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); - index_value = - rewriter->create(loc, compare, lrg_dim, sml_dim); + shape_values.push_back(rewriter->create(loc, large, i)); + continue; } - // Ideally, we would like to keep this on index but MLIR does not allow - // this. - shape_values.push_back(rewriter->create( - loc, index_value, rewriter->getIntegerType(32))); + // Compute the result shape depending on whether the rank of smaller is 1. + // This does not check that the broadcast operation actualy is correct. + // In particular, we do not check that both shapes are the same if the + // smaller ranked shape is not 1. 
+ ConstantOp one = rewriter->create( + loc, rewriter->getIntegerAttr(rewriter->getIndexType(), 1)); + DimOp lrg_dim = rewriter->create(loc, large, i); + DimOp sml_dim = rewriter->create(loc, small, indexes[i]); + CmpIOp compare = + rewriter->create(loc, CmpIPredicate::eq, lrg_dim, one); + shape_values.push_back( + rewriter->create(loc, compare, lrg_dim, sml_dim)); } return shape_values; diff --git a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc index 32d8b079c89..d53aaee3701 100644 --- a/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc +++ b/tensorflow/compiler/mlir/xla/transforms/unfuse_batch_norm.cc @@ -58,9 +58,7 @@ Value CalculateShapeValue(Location loc, Value operand, int64_t rank = result_type.getRank(); shape_values.reserve(rank); for (int64_t i = 0; i < rank; ++i) { - auto index_value = rewriter.create(loc, operand, i); - shape_values.push_back(rewriter.create( - loc, index_value, rewriter.getIntegerType(32))); + shape_values.push_back(rewriter.create(loc, operand, i)); } Type shape_element_type = shape_values.front().getType(); return rewriter.create( From 0c6f6f4776e132148cd7775492665c5456c28294 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 04:00:10 -0700 Subject: [PATCH 251/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/854f5f332af4 PiperOrigin-RevId: 311704350 Change-Id: I2314bba4a4f72fdceda3a3439bac3e3fb96b811a --- third_party/mlir/BUILD | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 1bddf2180bc..93843d58f30 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -2458,6 +2458,7 @@ cc_library( ":LLVMTransforms", ":LinalgToLLVM", ":LinalgToSPIRV", + ":LinalgToStandard", ":NVVMDialect", ":Parser", ":Pass", @@ -2543,6 +2544,7 @@ cc_library( ":LinalgPassIncGen", ":LinalgToLLVM", ":LinalgToSPIRV", + ":LinalgToStandard", ":LinalgTransforms", ":LoopPassIncGen", ":LoopsToGPUPass", @@ -3121,6 +3123,31 @@ cc_library( ], ) +cc_library( + name = "LinalgToStandard", + srcs = glob([ + "lib/Conversion/LinalgToStandard/*.cpp", + "lib/Conversion/LinalgToStandard/*.h", + ]) + ["lib/Conversion/PassDetail.h"], + hdrs = glob([ + "include/mlir/Conversion/LinalgToStandard/*.h", + ]), + includes = ["include"], + deps = [ + ":Affine", + ":ConversionPassIncGen", + ":IR", + ":LinalgOps", + ":Pass", + ":SCFDialect", + ":StandardOps", + ":Support", + ":Transforms", + "@llvm-project//llvm:core", + "@llvm-project//llvm:support", + ], +) + cc_library( name = "LinalgToSPIRV", srcs = glob([ From 872e950b51edbf3430d547e2fe4ed15ba8b18f77 Mon Sep 17 00:00:00 2001 From: seo-inyoung <62606132+seo-inyoung@users.noreply.github.com> Date: Fri, 15 May 2020 20:05:11 +0900 Subject: [PATCH 252/412] Update SECURITY.md simple error correction --- SECURITY.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 6fc2c3aa9cc..f3a6c148b2e 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -64,7 +64,7 @@ your model, and we recommend you run the TensorFlow process in a sandbox. It is possible to write models that are secure in a sense that they can safely process untrusted inputs assuming there are no bugs. 
There are two main reasons -to not rely on this: first, it is easy to write models which must not be exposed +to not rely on this: First, it is easy to write models which must not be exposed to untrusted inputs, and second, there are bugs in any software system of sufficient complexity. Letting users control inputs could allow them to trigger bugs either in TensorFlow or in dependent libraries. @@ -149,7 +149,7 @@ attack (or worse). Because TensorFlow behaves correctly, this is not a vulnerability in TensorFlow (although it would be a vulnerability of this hypothetical system). -As a general rule, it is incorrect behavior for Tensorflow to access memory it +As a general rule, it is incorrect behavior for TensorFlow to access memory it does not own, or to terminate in an unclean way. Bugs in TensorFlow that lead to such behaviors constitute a vulnerability. From 103bb013d4d4ba19da0445abd9b9c627af9df817 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 14:23:20 +0200 Subject: [PATCH 253/412] Verifiy differences with test annotation --- tensorflow/python/kernel_tests/map_fn_test.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 0bc3307e484..81dd817687a 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import numpy as np +from tensorflow.python.autograph.impl import api from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -186,7 +187,8 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) - @test_util.run_in_graph_and_eager_modes + #@test_util.run_in_graph_and_eager_modes + @test_util.run_deprecated_v1 def testMap_autograph_indirect(self): def test_function(x): cond = constant_op.constant(-1) @@ -195,6 +197,8 @@ class MapFnTest(test.TestCase): else: result = x return result + + @api.convert(recursive=False) def map_call(x): return map_fn.map_fn(test_function, x) From bbc2f3a190ff05a0bb8c30246dc71490587f434a Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 15:37:38 +0200 Subject: [PATCH 254/412] Let test to fail --- tensorflow/python/kernel_tests/map_fn_test.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/kernel_tests/map_fn_test.py b/tensorflow/python/kernel_tests/map_fn_test.py index 81dd817687a..8ead634aa11 100644 --- a/tensorflow/python/kernel_tests/map_fn_test.py +++ b/tensorflow/python/kernel_tests/map_fn_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python.autograph.impl import api +from tensorflow.python.eager import def_function from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -187,8 +187,7 @@ class MapFnTest(test.TestCase): self.assertAllEqual(-nums, received[1]) self.assertAllEqual(nums, received[2]) - #@test_util.run_in_graph_and_eager_modes - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes def testMap_autograph_indirect(self): def test_function(x): cond = constant_op.constant(-1) @@ -198,7 +197,7 @@ class MapFnTest(test.TestCase): result = x return result - @api.convert(recursive=False) + @def_function.function def map_call(x): return 
map_fn.map_fn(test_function, x) From 8da4a14be31d4621208724b373730b2c4972f2f9 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 15 May 2020 07:34:14 -0700 Subject: [PATCH 255/412] Avoid overhead for creating executors if there is no change in execution mode. PiperOrigin-RevId: 311726778 Change-Id: I33a1e5085e1740504181bd6096229b6df12b26f8 --- tensorflow/python/eager/context.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 86b3d5cf95f..c6ef21402d2 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -2027,6 +2027,8 @@ def execution_mode(mode): """Context manager for setting execution mode for current thread.""" if mode is None: yield + elif (mode == ASYNC) == context().executor.is_async(): + yield else: ctx = context() executor_new = executor.new_executor(mode == ASYNC) From 560762e40d9bb085ea33f52b36b96a3851e1b3d2 Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 16:49:53 +0200 Subject: [PATCH 256/412] Test autograph transform of fn --- tensorflow/python/ops/map_fn.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index 2c9c678336e..e39d35c36b0 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -22,6 +22,8 @@ from __future__ import print_function import re +from tensorflow.python.autograph.core import ag_ctx as autograph_ctx +from tensorflow.python.autograph.impl import api as autograph from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops @@ -477,6 +479,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) + autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) + result_value = autographed_fn(elems_value) result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) From 020a88ac127caa1e333ce36873ad2602abc5f7d7 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 15 May 2020 08:09:03 -0700 Subject: [PATCH 257/412] Rollback to investigate failure PiperOrigin-RevId: 311731132 Change-Id: I109ce87f13bb1b1c06b3e110bafbdf9c014c8258 --- tensorflow/lite/BUILD | 19 +- tensorflow/lite/core/subgraph.cc | 7 - tensorflow/lite/core/subgraph.h | 6 - tensorflow/lite/delegates/BUILD | 23 -- .../lite/delegates/interpreter_utils.cc | 67 ---- tensorflow/lite/delegates/interpreter_utils.h | 52 --- .../lite/delegates/interpreter_utils_test.cc | 92 ----- tensorflow/lite/delegates/utils.h | 2 - tensorflow/lite/interpreter.cc | 4 - tensorflow/lite/interpreter.h | 13 - tensorflow/lite/interpreter_test.cc | 309 +++++++++++++++- tensorflow/lite/interpreter_test.h | 331 ------------------ 12 files changed, 307 insertions(+), 618 deletions(-) delete mode 100644 tensorflow/lite/delegates/interpreter_utils.cc delete mode 100644 tensorflow/lite/delegates/interpreter_utils.h delete mode 100644 tensorflow/lite/delegates/interpreter_utils_test.cc delete mode 100644 tensorflow/lite/interpreter_test.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 4d8c07aa15b..14babee2da7 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -340,27 +340,11 @@ cc_test( ], ) -cc_library( - name = "interpreter_test_fixtures", - testonly = True, - hdrs = ["interpreter_test.h"], - deps = [ - ":framework", - "//tensorflow/lite/core/api", - "//tensorflow/lite/kernels:builtin_ops", - "//tensorflow/lite/kernels:kernel_util", - "//tensorflow/lite/kernels/internal:compatibility", - "@com_google_googletest//:gtest", - ], -) - # Test main interpreter cc_test( name = "interpreter_test", size = "small", - srcs = [ - "interpreter_test.cc", - ], + srcs = ["interpreter_test.cc"], features = ["-dynamic_link_test_srcs"], # see go/dynamic_link_test_srcs tags = [ "tflite_not_portable_ios", # TODO(b/117786830) @@ -368,7 +352,6 @@ cc_test( deps = [ ":external_cpu_backend_context", ":framework", - ":interpreter_test_fixtures", ":string_util", ":version", "//tensorflow/lite/core/api", diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 81710df128b..7f4e0e286ea 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -533,11 +533,6 @@ void Subgraph::SetCancellationFunction(void* data, check_cancelled_func_ = check_cancelled_func; } -bool Subgraph::IsCancelled() { - return (check_cancelled_func_ != nullptr) && - (*check_cancelled_func_)(cancellation_data_); -} - void Subgraph::ReserveNodes(int count) { nodes_and_registration_.reserve(count); } @@ -1321,8 +1316,6 @@ TfLiteStatus Subgraph::RemoveAllDelegates() { return kTfLiteOk; } -bool Subgraph::HasDelegates() { return !delegates_applied_.empty(); } - TfLiteStatus Subgraph::EnsureMemoryAllocations() { if (memory_planner_) { state_ = kStateUninvokable; diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index d6067daaa6a..0b0c1e31e89 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -553,9 +553,6 @@ class Subgraph { // afterwards. TfLiteStatus RemoveAllDelegates(); - // Returns true if the subgraph has delegates applied. - bool HasDelegates(); - // Cleanups up data reserved for the given node. Does not remove the {node, // registration} pair from nodes_and_registrations_. void CleanupNode(int node_index); @@ -581,9 +578,6 @@ class Subgraph { // Ensures the memory required is planned and allocated. TfLiteStatus EnsureMemoryAllocations(); - // Returns true if cancellation function returns true. 
- bool IsCancelled(); - // The state of the Interpreter. enum State { // The interpreter isn't ready to be invoked. diff --git a/tensorflow/lite/delegates/BUILD b/tensorflow/lite/delegates/BUILD index 8d4c921576d..df671675ec9 100644 --- a/tensorflow/lite/delegates/BUILD +++ b/tensorflow/lite/delegates/BUILD @@ -43,26 +43,3 @@ cc_test( "@com_google_googletest//:gtest_main", ], ) - -cc_library( - name = "interpreter_utils", - srcs = ["interpreter_utils.cc"], - hdrs = ["interpreter_utils.h"], - copts = tflite_copts(), - deps = [ - "//tensorflow/lite:framework", - ], -) - -cc_test( - name = "interpreter_utils_test", - srcs = ["interpreter_utils_test.cc"], - linkopts = tflite_linkopts(), - linkstatic = 1, - deps = [ - ":interpreter_utils", - "//tensorflow/lite:framework", - "//tensorflow/lite:interpreter_test_fixtures", - "@com_google_googletest//:gtest_main", - ], -) diff --git a/tensorflow/lite/delegates/interpreter_utils.cc b/tensorflow/lite/delegates/interpreter_utils.cc deleted file mode 100644 index 85d79d887fb..00000000000 --- a/tensorflow/lite/delegates/interpreter_utils.cc +++ /dev/null @@ -1,67 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/delegates/interpreter_utils.h" - -namespace tflite { -namespace delegates { -TfLiteStatus InterpreterUtils::InvokeWithCPUFallback(Interpreter* interpreter) { - TfLiteStatus status = interpreter->Invoke(); - if (status == kTfLiteOk || interpreter->IsCancelled() || - !interpreter->HasDelegates()) { - return status; - } - // Retry without delegation. - // TODO(b/138706191): retry only if error is due to delegation. - TF_LITE_REPORT_ERROR( - interpreter->error_reporter(), - "Invoke() failed in the presence of delegation. Retrying without."); - - // Copy input data to a buffer. - // Input data is safe since Subgraph::PrepareOpsAndTensors() passes - // preserve_inputs=true to ArenaPlanner. - std::vector buf; - size_t input_size = 0; - - for (auto i : interpreter->inputs()) { - TfLiteTensor* t = interpreter->tensor(i); - input_size += t->bytes; - } - buf.reserve(input_size); - auto bufp = buf.begin(); - for (auto i : interpreter->inputs()) { - // TF_LITE_ENSURE_STATUS(interpreter->EnsureTensorDataIsReadable(i)); - TfLiteTensor* t = interpreter->tensor(i); - std::copy(t->data.raw, t->data.raw + t->bytes, bufp); - bufp += t->bytes; - } - - TF_LITE_ENSURE_STATUS(interpreter->RemoveAllDelegates()); - - // Copy inputs from buffer. - bufp = buf.begin(); - for (auto i : interpreter->inputs()) { - TfLiteTensor* t = interpreter->tensor(i); - std::copy(bufp, bufp + t->bytes, t->data.raw); - bufp += t->bytes; - } - - // Invoke again. 
- TF_LITE_ENSURE_STATUS(interpreter->Invoke()); - return kTfLiteDelegateError; -} - -} // namespace delegates -} // namespace tflite diff --git a/tensorflow/lite/delegates/interpreter_utils.h b/tensorflow/lite/delegates/interpreter_utils.h deleted file mode 100644 index f736c2db1f4..00000000000 --- a/tensorflow/lite/delegates/interpreter_utils.h +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ -#define TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ - -#include "tensorflow/lite/interpreter.h" - -// Utility functions and classes for using delegates. - -namespace tflite { -namespace delegates { -#if !TFLITE_EXPERIMENTAL_RUNTIME_EAGER -class InterpreterUtils { - public: - /// Invokes an interpreter with automatic fallback from delegation to CPU. - /// - /// If using the delegate fails, the delegate is automatically undone and an - /// attempt made to return the interpreter to an invokable state. - /// - /// Allowing the fallback is suitable only if both of the following hold: - /// - The caller is known not to cache pointers to tensor data across Invoke() - /// calls. - /// - The model is not stateful (no variables, no LSTMs) or the state isn't - /// needed between batches. - /// - /// Returns one of the following three status codes: - /// 1. kTfLiteOk: Success. Output is valid. - /// 2. kTfLiteDelegateError: Delegate error but fallback succeeded. Output is - /// valid. - /// NOTE: This undoes all delegates previously applied to the Interpreter. - /// 3. kTfLiteError: Unexpected/runtime failure. Output is invalid. - /// WARNING: This is an experimental API and subject to change. - static TfLiteStatus InvokeWithCPUFallback(Interpreter* interpreter); -}; -#endif // !TFLITE_EXPERIMENTAL_RUNTIME_EAGER -} // namespace delegates -} // namespace tflite - -#endif // TENSORFLOW_LITE_DELEGATES_INTERPRETER_UTILS_H_ diff --git a/tensorflow/lite/delegates/interpreter_utils_test.cc b/tensorflow/lite/delegates/interpreter_utils_test.cc deleted file mode 100644 index 8dc856d796c..00000000000 --- a/tensorflow/lite/delegates/interpreter_utils_test.cc +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/lite/delegates/interpreter_utils.h" - -#include -#include -#include "tensorflow/lite/interpreter_test.h" - -namespace tflite { -namespace { - -TEST_F(TestDelegate, DelegateNodeInvokeFailureFallback) { - delegate_ = std::unique_ptr(new SimpleDelegate( - {0, 1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, - 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); - ASSERT_EQ( - interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), - kTfLiteOk); - // Delegation modified execution plan. - ASSERT_EQ(interpreter_->execution_plan().size(), 1); - - std::vector input = {1.0f, 2.0f, 3.0f}; - std::vector expected_output = {2.0f, 4.0f, 6.0f}; - constexpr int kOutputTensorIndex = 3; - - memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); - memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - EXPECT_EQ( - delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), - kTfLiteDelegateError); - // Delegation removed, returning to original execution plan. - ASSERT_EQ(interpreter_->execution_plan().size(), 3); - // Check outputs. - TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); - for (int i = 0; i < 3; ++i) { - EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; - } -} - -TEST_F(TestDelegate, TestFallbackWithMultipleDelegates) { - // First delegate only supports node 0. - // This delegate should support dynamic tensors, otherwise the second won't be - // applied. - delegate_ = std::unique_ptr( - new SimpleDelegate({0}, kTfLiteDelegateFlagsAllowDynamicTensors)); - // Second delegate supports nodes 1 & 2, and makes the graph immutable. - delegate2_ = std::unique_ptr(new SimpleDelegate( - {1, 2}, kTfLiteDelegateFlagsNone, false /**fail_node_prepare**/, - 0 /**min_ops_per_subset**/, true /**fail_node_invoke**/)); - // Pre-delegation execution plan should have three nodes. - ASSERT_EQ(interpreter_->execution_plan().size(), 3); - ASSERT_EQ( - interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()), - kTfLiteOk); - ASSERT_EQ( - interpreter_->ModifyGraphWithDelegate(delegate2_->get_tf_lite_delegate()), - kTfLiteOk); - // Should be two delegates nodes. - ASSERT_EQ(interpreter_->execution_plan().size(), 2); - - std::vector input = {1.0f, 2.0f, 3.0f}; - std::vector expected_output = {2.0f, 4.0f, 6.0f}; - constexpr int kOutputTensorIndex = 2; - TfLiteTensor* tensor = interpreter_->tensor(kOutputTensorIndex); - - memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); - memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - EXPECT_EQ( - delegates::InterpreterUtils::InvokeWithCPUFallback(interpreter_.get()), - kTfLiteDelegateError); - // All delegates should be undone. - EXPECT_EQ(interpreter_->execution_plan().size(), 3); - for (int i = 0; i < 3; ++i) { - EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; - } -} - -} // namespace -} // namespace tflite diff --git a/tensorflow/lite/delegates/utils.h b/tensorflow/lite/delegates/utils.h index 3b0668af04b..d6d22c4efa2 100644 --- a/tensorflow/lite/delegates/utils.h +++ b/tensorflow/lite/delegates/utils.h @@ -16,8 +16,6 @@ limitations under the License. #ifndef TENSORFLOW_LITE_DELEGATES_UTILS_H_ #define TENSORFLOW_LITE_DELEGATES_UTILS_H_ -// Utility functions and classes for implementing delegates. 
- #include #include #include diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc index 167254a2a62..c8ccf671d60 100644 --- a/tensorflow/lite/interpreter.cc +++ b/tensorflow/lite/interpreter.cc @@ -310,8 +310,6 @@ void Interpreter::SetCancellationFunction(void* data, } } -bool Interpreter::IsCancelled() { return primary_subgraph().IsCancelled(); } - TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) { TfLiteStatus status = kTfLiteOk; for (auto& subgraph : subgraphs_) { @@ -342,8 +340,6 @@ TfLiteStatus Interpreter::RemoveAllDelegates() { return kTfLiteOk; } -bool Interpreter::HasDelegates() { return primary_subgraph().HasDelegates(); } - TfLiteStatus Interpreter::SetBufferHandle(int tensor_index, TfLiteBufferHandle buffer_handle, TfLiteDelegate* delegate) { diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h index aa9c54d295f..b93fd76c13b 100644 --- a/tensorflow/lite/interpreter.h +++ b/tensorflow/lite/interpreter.h @@ -42,9 +42,6 @@ namespace tflite { class InterpreterTest; class TestDelegate; -namespace delegates { -class InterpreterUtils; // Class for friend declarations. -} // namespace delegates namespace impl { @@ -532,7 +529,6 @@ class Interpreter { friend class InterpreterBuilder; friend class tflite::InterpreterTest; friend class tflite::TestDelegate; - friend class tflite::delegates::InterpreterUtils; /// Set the value of an external context. static void SetExternalContext(struct TfLiteContext* context, @@ -546,15 +542,6 @@ class Interpreter { // afterwards. TfLiteStatus RemoveAllDelegates(); - // Returns true if delegates have been applied. - bool HasDelegates(); - - // Returns true if cancellation function returns true. - bool IsCancelled(); - - // Get the error reporter associated with this interpreter. - ErrorReporter* error_reporter() { return error_reporter_; } - // A pure C data structure used to communicate with the pure C plugin // interface. To avoid copying tensor metadata, this is also the definitive // structure to store tensors. diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index 1d8f82ef16a..cfc7c168aa5 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -24,7 +24,6 @@ limitations under the License. #include "third_party/eigen3/Eigen/Core" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/external_cpu_backend_context.h" -#include "tensorflow/lite/interpreter_test.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/kernels/kernel_util.h" @@ -36,6 +35,25 @@ limitations under the License. namespace tflite { +// InterpreterTest is a friend of Interpreter, so it can access context_. 
+class InterpreterTest : public ::testing::Test { + public: + template + static TfLiteStatus ModifyGraphWithDelegate( + Interpreter* interpreter, std::unique_ptr delegate) { + Interpreter::TfLiteDelegatePtr tflite_delegate( + delegate.release(), [](TfLiteDelegate* delegate) { + delete reinterpret_cast(delegate); + }); + return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); + } + + protected: + TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } + + Interpreter interpreter_; +}; + namespace ops { namespace builtin { TfLiteRegistration* Register_PADV2(); @@ -1286,6 +1304,291 @@ TEST_F(TestExecutionPlan, NullExecutionPlan) { ASSERT_EQ(run_order_, std::vector()); } +// Build a kernel registration for an op that copies its one input +// to an output +TfLiteRegistration AddOpRegistration() { + TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; + + reg.custom_name = "my_add"; + reg.builtin_code = tflite::BuiltinOperator_CUSTOM; + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1 = GetInput(context, node, 0); + const TfLiteTensor* input2 = GetInput(context, node, 1); + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); + for (int i = 0; i < input1->dims->size; ++i) { + TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); + } + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + + reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { + // Copy input data to output data. + const TfLiteTensor* a0 = GetInput(context, node, 0); + TF_LITE_ENSURE(context, a0); + TF_LITE_ENSURE(context, a0->data.f); + const TfLiteTensor* a1 = GetInput(context, node, 1); + TF_LITE_ENSURE(context, a1); + TF_LITE_ENSURE(context, a1->data.f); + TfLiteTensor* out = GetOutput(context, node, 0); + TF_LITE_ENSURE(context, out); + TF_LITE_ENSURE(context, out->data.f); + int num = a0->dims->data[0]; + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + return kTfLiteOk; + }; + return reg; +} + +} // namespace + +// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). +class TestDelegate : public ::testing::Test { + protected: + void SetUp() override { + interpreter_.reset(new Interpreter); + interpreter_->AddTensors(5); + interpreter_->SetInputs({0, 1}); + interpreter_->SetOutputs({3, 4}); + TfLiteQuantizationParams quant; + interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, + quant); + interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, + quant); + TfLiteRegistration reg = AddOpRegistration(); + interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); + interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); + } + + void TearDown() override { + // Interpreter relies on delegate to free the resources properly. Thus + // the life cycle of delegate must be longer than interpreter. 
+ interpreter_.reset(); + delegate_.reset(); + } + + TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; + + TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } + + TfLiteStatus RemoveAllDelegates() { + return interpreter_->RemoveAllDelegates(); + } + + protected: + class SimpleDelegate { + public: + // Create a simple implementation of a TfLiteDelegate. We use the C++ class + // SimpleDelegate and it can produce a handle TfLiteDelegate that is + // value-copyable and compatible with TfLite. + // fail_node_prepare: To simulate failure of Delegate node's Prepare(). + // min_ops_per_subset: If >0, partitioning preview is used to choose only + // those subsets with min_ops_per_subset number of nodes. + // fail_node_invoke: To simulate failure of Delegate node's Invoke(). + explicit SimpleDelegate( + const std::vector& nodes, + TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, + bool fail_node_prepare = false, int min_ops_per_subset = 0, + bool fail_node_invoke = false) + : nodes_(nodes), + fail_delegate_node_prepare_(fail_node_prepare), + min_ops_per_subset_(min_ops_per_subset), + fail_delegate_node_invoke_(fail_node_invoke) { + delegate_.Prepare = [](TfLiteContext* context, + TfLiteDelegate* delegate) -> TfLiteStatus { + auto* simple = static_cast(delegate->data_); + TfLiteIntArray* nodes_to_separate = + TfLiteIntArrayCreate(simple->nodes_.size()); + // Mark nodes that we want in TfLiteIntArray* structure. + int index = 0; + for (auto node_index : simple->nodes_) { + nodes_to_separate->data[index++] = node_index; + // make sure node is added + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + // Check that all nodes are available + TfLiteIntArray* execution_plan; + TF_LITE_ENSURE_STATUS( + context->GetExecutionPlan(context, &execution_plan)); + for (int exec_index = 0; exec_index < execution_plan->size; + exec_index++) { + int node_index = execution_plan->data[exec_index]; + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + if (exec_index == node_index) { + // Check op details only if it wasn't delegated already. + TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); + TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); + } + } + + // Get preview of delegate partitioning from the context. + TfLiteDelegateParams* params_array; + int num_partitions; + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + if (simple->min_ops_per_subset() > 0) { + // Build a new vector of ops from subsets with atleast the minimum + // size. + std::vector allowed_ops; + for (int idx = 0; idx < num_partitions; ++idx) { + const auto* nodes_in_subset = params_array[idx].nodes_to_replace; + if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; + allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, + nodes_in_subset->data + nodes_in_subset->size); + } + + // Free existing nodes_to_separate & initialize a new array with + // allowed_ops. 
+ TfLiteIntArrayFree(nodes_to_separate); + nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); + memcpy(nodes_to_separate->data, allowed_ops.data(), + sizeof(int) * nodes_to_separate->size); + } + + // Another call to PreviewDelegateParitioning should be okay, since + // partitioning memory is managed by context. + TFLITE_CHECK_EQ( + context->PreviewDelegatePartitioning( + context, nodes_to_separate, ¶ms_array, &num_partitions), + kTfLiteOk); + + context->ReplaceNodeSubsetsWithDelegateKernels( + context, simple->FakeFusedRegistration(), nodes_to_separate, + delegate); + TfLiteIntArrayFree(nodes_to_separate); + return kTfLiteOk; + }; + delegate_.CopyToBufferHandle = [](TfLiteContext* context, + TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* tensor) -> TfLiteStatus { + // TODO(b/156586986): Implement tests to test buffer copying logic. + return kTfLiteOk; + }; + delegate_.CopyFromBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle buffer_handle, + TfLiteTensor* output) -> TfLiteStatus { + TFLITE_CHECK_GE(buffer_handle, -1); + TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); + const float floats[] = {6., 6., 6.}; + int num = output->dims->data[0]; + for (int i = 0; i < num; i++) { + output->data.f[i] = floats[i]; + } + return kTfLiteOk; + }; + + delegate_.FreeBufferHandle = + [](TfLiteContext* context, TfLiteDelegate* delegate, + TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; + // Store type-punned data SimpleDelegate structure. + delegate_.data_ = static_cast(this); + delegate_.flags = delegate_flags; + } + + TfLiteRegistration FakeFusedRegistration() { + TfLiteRegistration reg = {nullptr}; + reg.custom_name = "fake_fused_op"; + + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + // Copy input data to output data. 
+ const TfLiteTensor* a0; + const TfLiteTensor* a1; + if (node->inputs->size == 2) { + a0 = GetInput(context, node, 0); + a1 = GetInput(context, node, 1); + } else { + a0 = GetInput(context, node, 0); + a1 = a0; + } + TfLiteTensor* out = GetOutput(context, node, 0); + int num = 1; + for (int i = 0; i < a0->dims->size; ++i) { + num *= a0->dims->data[i]; + } + for (int i = 0; i < num; i++) { + out->data.f[i] = a0->data.f[i] + a1->data.f[i]; + } + // Make the data stale so that CopyFromBufferHandle can be invoked + out->data_is_stale = true; + return kTfLiteOk; + }; + if (fail_delegate_node_invoke_) { + reg.invoke = [](TfLiteContext* context, + TfLiteNode* node) -> TfLiteStatus { + return kTfLiteError; + }; + } + + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + // Set output size to input size + const TfLiteTensor* input1; + const TfLiteTensor* input2; + if (node->inputs->size == 2) { + input1 = GetInput(context, node, 0); + input2 = GetInput(context, node, 1); + } else { + input1 = GetInput(context, node, 0); + input2 = input1; + } + TfLiteTensor* output = GetOutput(context, node, 0); + + TF_LITE_ENSURE_STATUS(context->ResizeTensor( + context, output, TfLiteIntArrayCopy(input1->dims))); + return kTfLiteOk; + }; + if (fail_delegate_node_prepare_) { + reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { + return kTfLiteError; + }; + } + + return reg; + } + + TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } + + int min_ops_per_subset() { return min_ops_per_subset_; } + + private: + std::vector nodes_; + TfLiteDelegate delegate_; + bool fail_delegate_node_prepare_ = false; + int min_ops_per_subset_ = 0; + bool fail_delegate_node_invoke_ = false; + }; + + std::unique_ptr interpreter_; + std::unique_ptr delegate_, delegate2_; +}; +namespace { + TEST_F(TestDelegate, BasicDelegate) { delegate_ = std::unique_ptr(new SimpleDelegate({0, 1, 2})); interpreter_->ModifyGraphWithDelegate(delegate_->get_tf_lite_delegate()); @@ -1664,7 +1967,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { // Verify Invoke() behavior. memcpy(interpreter_->typed_tensor(0), input.data(), 3 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 3 * sizeof(float)); - EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); + interpreter_->Invoke(); for (int i = 0; i < 3; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } @@ -1678,7 +1981,7 @@ TEST_F(TestDelegate, TestResizeInputWithMultipleDelegates) { memcpy(interpreter_->typed_tensor(0), input.data(), 4 * sizeof(float)); memcpy(interpreter_->typed_tensor(1), input.data(), 4 * sizeof(float)); - EXPECT_EQ(interpreter_->Invoke(), kTfLiteOk); + interpreter_->Invoke(); for (int i = 0; i < 4; ++i) { EXPECT_EQ(tensor->data.f[i], expected_output[i]) << i; } diff --git a/tensorflow/lite/interpreter_test.h b/tensorflow/lite/interpreter_test.h deleted file mode 100644 index d4f0c8a05c5..00000000000 --- a/tensorflow/lite/interpreter_test.h +++ /dev/null @@ -1,331 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_LITE_INTERPRETER_TEST_H_ -#define TENSORFLOW_LITE_INTERPRETER_TEST_H_ - -#include -#include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/interpreter.h" -#include "tensorflow/lite/kernels/internal/compatibility.h" -#include "tensorflow/lite/kernels/kernel_util.h" -#include "tensorflow/lite/kernels/register.h" - -namespace tflite { -// InterpreterTest is a friend of Interpreter, so it can access context_. -class InterpreterTest : public ::testing::Test { - public: - template - static TfLiteStatus ModifyGraphWithDelegate( - Interpreter* interpreter, std::unique_ptr delegate) { - Interpreter::TfLiteDelegatePtr tflite_delegate( - delegate.release(), [](TfLiteDelegate* delegate) { - delete reinterpret_cast(delegate); - }); - return interpreter->ModifyGraphWithDelegate(std::move(tflite_delegate)); - } - - protected: - TfLiteContext* GetInterpreterContext() { return interpreter_.context_; } - - Interpreter interpreter_; -}; - -// Build a kernel registration for an op that copies its one input -// to an output -TfLiteRegistration AddOpRegistration() { - TfLiteRegistration reg = {nullptr, nullptr, nullptr, nullptr}; - - reg.custom_name = "my_add"; - reg.builtin_code = tflite::BuiltinOperator_CUSTOM; - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1 = GetInput(context, node, 0); - const TfLiteTensor* input2 = GetInput(context, node, 1); - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_EQ(context, input1->dims->size, input2->dims->size); - for (int i = 0; i < input1->dims->size; ++i) { - TF_LITE_ENSURE_EQ(context, input1->dims->data[i], input2->dims->data[i]); - } - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - - reg.invoke = [](TfLiteContext* context, TfLiteNode* node) { - // Copy input data to output data. - const TfLiteTensor* a0 = GetInput(context, node, 0); - TF_LITE_ENSURE(context, a0); - TF_LITE_ENSURE(context, a0->data.f); - const TfLiteTensor* a1 = GetInput(context, node, 1); - TF_LITE_ENSURE(context, a1); - TF_LITE_ENSURE(context, a1->data.f); - TfLiteTensor* out = GetOutput(context, node, 0); - TF_LITE_ENSURE(context, out); - TF_LITE_ENSURE(context, out->data.f); - int num = a0->dims->data[0]; - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - return kTfLiteOk; - }; - return reg; -} - -// TestDelegate is a friend of Interpreter to access RemoveAllDelegates(). 
-class TestDelegate : public ::testing::Test { - protected: - void SetUp() override { - interpreter_.reset(new Interpreter); - interpreter_->AddTensors(5); - interpreter_->SetInputs({0, 1}); - interpreter_->SetOutputs({3, 4}); - TfLiteQuantizationParams quant; - interpreter_->SetTensorParametersReadWrite(0, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(1, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(2, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(3, kTfLiteFloat32, "", {3}, - quant); - interpreter_->SetTensorParametersReadWrite(4, kTfLiteFloat32, "", {3}, - quant); - TfLiteRegistration reg = AddOpRegistration(); - interpreter_->AddNodeWithParameters({0, 0}, {2}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({1, 1}, {3}, nullptr, 0, nullptr, ®); - interpreter_->AddNodeWithParameters({2, 1}, {4}, nullptr, 0, nullptr, ®); - } - - void TearDown() override { - // Interpreter relies on delegate to free the resources properly. Thus - // the life cycle of delegate must be longer than interpreter. - interpreter_.reset(); - delegate_.reset(); - } - - TfLiteBufferHandle last_allocated_handle_ = kTfLiteNullBufferHandle; - - TfLiteBufferHandle AllocateBufferHandle() { return ++last_allocated_handle_; } - - TfLiteStatus RemoveAllDelegates() { - return interpreter_->RemoveAllDelegates(); - } - - protected: - class SimpleDelegate { - public: - // Create a simple implementation of a TfLiteDelegate. We use the C++ class - // SimpleDelegate and it can produce a handle TfLiteDelegate that is - // value-copyable and compatible with TfLite. - // fail_node_prepare: To simulate failure of Delegate node's Prepare(). - // min_ops_per_subset: If >0, partitioning preview is used to choose only - // those subsets with min_ops_per_subset number of nodes. - // fail_node_invoke: To simulate failure of Delegate node's Invoke(). - explicit SimpleDelegate( - const std::vector& nodes, - TfLiteDelegateFlags delegate_flags = kTfLiteDelegateFlagsNone, - bool fail_node_prepare = false, int min_ops_per_subset = 0, - bool fail_node_invoke = false) - : nodes_(nodes), - fail_delegate_node_prepare_(fail_node_prepare), - min_ops_per_subset_(min_ops_per_subset), - fail_delegate_node_invoke_(fail_node_invoke) { - delegate_.Prepare = [](TfLiteContext* context, - TfLiteDelegate* delegate) -> TfLiteStatus { - auto* simple = static_cast(delegate->data_); - TfLiteIntArray* nodes_to_separate = - TfLiteIntArrayCreate(simple->nodes_.size()); - // Mark nodes that we want in TfLiteIntArray* structure. - int index = 0; - for (auto node_index : simple->nodes_) { - nodes_to_separate->data[index++] = node_index; - // make sure node is added - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - // Check that all nodes are available - TfLiteIntArray* execution_plan; - TF_LITE_ENSURE_STATUS( - context->GetExecutionPlan(context, &execution_plan)); - for (int exec_index = 0; exec_index < execution_plan->size; - exec_index++) { - int node_index = execution_plan->data[exec_index]; - TfLiteNode* node; - TfLiteRegistration* reg; - context->GetNodeAndRegistration(context, node_index, &node, ®); - if (exec_index == node_index) { - // Check op details only if it wasn't delegated already. 
- TFLITE_CHECK_EQ(reg->builtin_code, tflite::BuiltinOperator_CUSTOM); - TFLITE_CHECK_EQ(strcmp(reg->custom_name, "my_add"), 0); - } - } - - // Get preview of delegate partitioning from the context. - TfLiteDelegateParams* params_array; - int num_partitions; - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - if (simple->min_ops_per_subset() > 0) { - // Build a new vector of ops from subsets with atleast the minimum - // size. - std::vector allowed_ops; - for (int idx = 0; idx < num_partitions; ++idx) { - const auto* nodes_in_subset = params_array[idx].nodes_to_replace; - if (nodes_in_subset->size < simple->min_ops_per_subset()) continue; - allowed_ops.insert(allowed_ops.end(), nodes_in_subset->data, - nodes_in_subset->data + nodes_in_subset->size); - } - - // Free existing nodes_to_separate & initialize a new array with - // allowed_ops. - TfLiteIntArrayFree(nodes_to_separate); - nodes_to_separate = TfLiteIntArrayCreate(allowed_ops.size()); - memcpy(nodes_to_separate->data, allowed_ops.data(), - sizeof(int) * nodes_to_separate->size); - } - - // Another call to PreviewDelegateParitioning should be okay, since - // partitioning memory is managed by context. - TFLITE_CHECK_EQ( - context->PreviewDelegatePartitioning( - context, nodes_to_separate, ¶ms_array, &num_partitions), - kTfLiteOk); - - context->ReplaceNodeSubsetsWithDelegateKernels( - context, simple->FakeFusedRegistration(), nodes_to_separate, - delegate); - TfLiteIntArrayFree(nodes_to_separate); - return kTfLiteOk; - }; - delegate_.CopyToBufferHandle = [](TfLiteContext* context, - TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* tensor) -> TfLiteStatus { - // TODO(b/156586986): Implement tests to test buffer copying logic. - return kTfLiteOk; - }; - delegate_.CopyFromBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle buffer_handle, - TfLiteTensor* output) -> TfLiteStatus { - TFLITE_CHECK_GE(buffer_handle, -1); - TFLITE_CHECK_EQ(output->buffer_handle, buffer_handle); - const float floats[] = {6., 6., 6.}; - int num = output->dims->data[0]; - for (int i = 0; i < num; i++) { - output->data.f[i] = floats[i]; - } - return kTfLiteOk; - }; - - delegate_.FreeBufferHandle = - [](TfLiteContext* context, TfLiteDelegate* delegate, - TfLiteBufferHandle* handle) { *handle = kTfLiteNullBufferHandle; }; - // Store type-punned data SimpleDelegate structure. - delegate_.data_ = static_cast(this); - delegate_.flags = delegate_flags; - } - - TfLiteRegistration FakeFusedRegistration() { - TfLiteRegistration reg = {nullptr}; - reg.custom_name = "fake_fused_op"; - - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - // Copy input data to output data. 
- const TfLiteTensor* a0; - const TfLiteTensor* a1; - if (node->inputs->size == 2) { - a0 = GetInput(context, node, 0); - a1 = GetInput(context, node, 1); - } else { - a0 = GetInput(context, node, 0); - a1 = a0; - } - TfLiteTensor* out = GetOutput(context, node, 0); - int num = 1; - for (int i = 0; i < a0->dims->size; ++i) { - num *= a0->dims->data[i]; - } - for (int i = 0; i < num; i++) { - out->data.f[i] = a0->data.f[i] + a1->data.f[i]; - } - // Make the data stale so that CopyFromBufferHandle can be invoked - if (out->buffer_handle != kTfLiteNullBufferHandle) { - out->data_is_stale = true; - } - return kTfLiteOk; - }; - if (fail_delegate_node_invoke_) { - reg.invoke = [](TfLiteContext* context, - TfLiteNode* node) -> TfLiteStatus { - return kTfLiteError; - }; - } - - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - // Set output size to input size - const TfLiteTensor* input1; - const TfLiteTensor* input2; - if (node->inputs->size == 2) { - input1 = GetInput(context, node, 0); - input2 = GetInput(context, node, 1); - } else { - input1 = GetInput(context, node, 0); - input2 = input1; - } - TfLiteTensor* output = GetOutput(context, node, 0); - - TF_LITE_ENSURE_STATUS(context->ResizeTensor( - context, output, TfLiteIntArrayCopy(input1->dims))); - return kTfLiteOk; - }; - if (fail_delegate_node_prepare_) { - reg.prepare = [](TfLiteContext* context, TfLiteNode* node) { - return kTfLiteError; - }; - } - - return reg; - } - - TfLiteDelegate* get_tf_lite_delegate() { return &delegate_; } - - int min_ops_per_subset() { return min_ops_per_subset_; } - - private: - std::vector nodes_; - TfLiteDelegate delegate_; - bool fail_delegate_node_prepare_ = false; - int min_ops_per_subset_ = 0; - bool fail_delegate_node_invoke_ = false; - }; - - std::unique_ptr interpreter_; - std::unique_ptr delegate_, delegate2_; -}; -} // namespace tflite - -#endif // TENSORFLOW_LITE_INTERPRETER_TEST_H_ From d6dd56f74f228227dc9781bd389147df61d3784e Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 17:26:04 +0200 Subject: [PATCH 258/412] Remove original fn call --- tensorflow/python/ops/map_fn.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index e39d35c36b0..b98b4ad10bc 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -481,7 +481,6 @@ def map_fn(fn, elems_value = elems_unflatten(elems_value_flat) autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) result_value = autographed_fn(elems_value) - result_value = fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) result_value_batchable = _result_value_flat_to_batchable( From ec0026c8c38319c8ea1cc6ce80a1e5b6bb48c502 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Fri, 15 May 2020 08:38:22 -0700 Subject: [PATCH 259/412] Update ops_history for bincount. 
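For orientation, a hedged sketch of how an op kernel could consume the renamed attribute (the class name and wiring below are illustrative assumptions, not the actual bincount kernels; only the "binary_output" spelling comes from the compat-history update below):

    #include "tensorflow/core/framework/op_kernel.h"

    // Hypothetical kernel skeleton reading the attribute at construction time.
    class BincountLikeOp : public tensorflow::OpKernel {
     public:
      explicit BincountLikeOp(tensorflow::OpKernelConstruction* ctx)
          : tensorflow::OpKernel(ctx) {
        // "binary_output" is the new spelling recorded in the compat history;
        // it replaces the older "binary_count" name.
        OP_REQUIRES_OK(ctx, ctx->GetAttr("binary_output", &binary_output_));
      }
      void Compute(tensorflow::OpKernelContext* ctx) override {
        // A real kernel would branch on binary_output_ when filling the bins.
      }

     private:
      bool binary_output_ = false;
    };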
PiperOrigin-RevId: 311735482 Change-Id: I4bff5fdf6a840a5a5c692b5b906817815d41ba71 --- tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt | 2 +- tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt | 2 +- tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt index e26e1639e82..9bab6854e40 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/DenseBincount.pbtxt @@ -39,7 +39,7 @@ op { } } attr { - name: "binary_count" + name: "binary_output" type: "bool" default_value { b: false diff --git a/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt index 9d94149cc09..4f5fb24109c 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/RaggedBincount.pbtxt @@ -43,7 +43,7 @@ op { } } attr { - name: "binary_count" + name: "binary_output" type: "bool" default_value { b: false diff --git a/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt index 333b71a5e1c..9bbc5132845 100644 --- a/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt +++ b/tensorflow/core/ops/compat/ops_history_v2/SparseBincount.pbtxt @@ -47,7 +47,7 @@ op { } } attr { - name: "binary_count" + name: "binary_output" type: "bool" default_value { b: false From e8d51ef6010faec838b34fe07cddb7369721d903 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Fri, 15 May 2020 08:50:01 -0700 Subject: [PATCH 260/412] Remove the unnecessary address-returning operator and lambda expression.
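In essence, when a C-style destroy function already matches the smart pointer's deleter type, it can be passed by name. A minimal self-contained sketch of the pattern (Foo, CreateFoo and DeleteFoo are hypothetical stand-ins; TfLiteDelegatePtr is assumed to be a std::unique_ptr with a plain function-pointer deleter, as the call sites in the diff below suggest):

    #include <memory>

    // Hypothetical C-style resource API, used only for illustration.
    struct Foo { int value = 0; };
    Foo* CreateFoo() { return new Foo(); }
    void DeleteFoo(Foo* foo) { delete foo; }

    // Alias mirroring the assumed shape of TfLiteDelegatePtr.
    using FooPtr = std::unique_ptr<Foo, void (*)(Foo*)>;

    FooPtr MakeFoo() {
      // A function name decays to a function pointer, so neither &DeleteFoo
      // nor a wrapping lambda is needed when the signatures already match.
      return FooPtr(CreateFoo(), DeleteFoo);
    }

    int main() { FooPtr foo = MakeFoo(); return foo->value; }

Passing the deleter directly avoids both the redundant address-of operator and a forwarding lambda, which is all this change does at the TfLite call sites.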
PiperOrigin-RevId: 311737378 Change-Id: I55bf12bf66540ed32dd48d61da7f41bdf2ace5eb --- tensorflow/lite/tools/evaluation/utils.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/tools/evaluation/utils.cc b/tensorflow/lite/tools/evaluation/utils.cc index 33967b6f4ea..3807814fee1 100644 --- a/tensorflow/lite/tools/evaluation/utils.cc +++ b/tensorflow/lite/tools/evaluation/utils.cc @@ -119,7 +119,7 @@ TfLiteDelegatePtr CreateNNAPIDelegate(StatefulNnApiDelegate::Options options) { #if defined(__ANDROID__) TfLiteDelegatePtr CreateGPUDelegate(TfLiteGpuDelegateOptionsV2* options) { return TfLiteDelegatePtr(TfLiteGpuDelegateV2Create(options), - &TfLiteGpuDelegateV2Delete); + TfLiteGpuDelegateV2Delete); } #endif // defined(__ANDROID__) @@ -184,9 +184,7 @@ TfLiteDelegatePtr CreateXNNPACKDelegate() { TfLiteDelegatePtr CreateXNNPACKDelegate( const TfLiteXNNPackDelegateOptions* xnnpack_options) { auto xnnpack_delegate = TfLiteXNNPackDelegateCreate(xnnpack_options); - return TfLiteDelegatePtr(xnnpack_delegate, [](TfLiteDelegate* delegate) { - TfLiteXNNPackDelegateDelete(delegate); - }); + return TfLiteDelegatePtr(xnnpack_delegate, TfLiteXNNPackDelegateDelete); } TfLiteDelegatePtr CreateXNNPACKDelegate(int num_threads) { From 6bddca85b3f792cef733da529ea3fbb92fcb9522 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 15 May 2020 09:18:04 -0700 Subject: [PATCH 261/412] Use fully-qualified std::string in TraceMe PiperOrigin-RevId: 311741974 Change-Id: Ic9100c53ded4011b590651cbb5ca276b093a3fc2 --- .../core/profiler/internal/traceme_recorder.h | 5 +- tensorflow/core/profiler/lib/traceme.h | 54 +++++++++---------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/tensorflow/core/profiler/internal/traceme_recorder.h b/tensorflow/core/profiler/internal/traceme_recorder.h index 1da7d4cebb1..5fdea5bddbd 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder.h +++ b/tensorflow/core/profiler/internal/traceme_recorder.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_INTERNAL_TRACEME_RECORDER_H_ #include +#include #include #include "absl/container/flat_hash_map.h" @@ -52,13 +53,13 @@ class TraceMeRecorder { // Times are in ns since the Unix epoch. struct Event { uint64 activity_id; - string name; + std::string name; uint64 start_time; // 0 = missing uint64 end_time; // 0 = missing }; struct ThreadInfo { uint32 tid; - string name; + std::string name; }; struct ThreadEvents { ThreadInfo thread; diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index af93ac11b1e..2c3e3ebe6cc 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -16,12 +16,10 @@ limitations under the License. #define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ #include +#include #include -#include "absl/strings/match.h" -#include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" -#include "absl/strings/strip.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" @@ -78,20 +76,21 @@ inline int GetTFTraceMeLevel(bool is_expensive) { // auto id = ActivityStart("step"); // ... do some work ... // ActivityEnd(id); +// The two static methods should be called within the same thread. class TraceMe { public: - // Constructor that traces a user-defined activity labeled with activity_name + // Constructor that traces a user-defined activity labeled with name // in the UI. 
Level defines the trace priority, used for filtering TraceMe // events. By default, traces with TraceMe level <= 2 are recorded. Levels: // - Must be a positive integer. // - Can be a value in enum TraceMeLevel. // Users are welcome to use level > 3 in their code, if they wish to filter // out their host traces based on verbosity. - explicit TraceMe(absl::string_view activity_name, int level = 1) { + explicit TraceMe(absl::string_view name, int level = 1) { DCHECK_GE(level, 1); #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - new (&no_init_.name) string(activity_name); + new (&no_init_.name) std::string(name); start_time_ = EnvTime::NowNanos(); } #endif @@ -102,26 +101,26 @@ class TraceMe { // Note: We can't take the string by value because a) it would make the // overloads ambiguous, and b) we want lvalue strings to use the string_view // constructor so we avoid copying them when tracing is disabled. - explicit TraceMe(string &&activity_name, int level = 1) { + explicit TraceMe(std::string&& name, int level = 1) { DCHECK_GE(level, 1); #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - new (&no_init_.name) string(std::move(activity_name)); + new (&no_init_.name) std::string(std::move(name)); start_time_ = EnvTime::NowNanos(); } #endif } // Do not allow passing strings by reference or value since the caller - // may unintentionally maintain ownership of the activity_name. - // Explicitly std::move the activity_name or wrap it in a string_view if + // may unintentionally maintain ownership of the name. + // Explicitly std::move the name or wrap it in a string_view if // you really wish to maintain ownership. - explicit TraceMe(const string &activity_name, int level = 1) = delete; + explicit TraceMe(const std::string& name, int level = 1) = delete; // This overload is necessary to make TraceMe's with string literals work. // Otherwise, the string&& and the string_view constructor would be equally // good overload candidates. - explicit TraceMe(const char *raw, int level = 1) + explicit TraceMe(const char* raw, int level = 1) : TraceMe(absl::string_view(raw), level) {} // This overload only generates the activity name if tracing is enabled. @@ -136,12 +135,14 @@ class TraceMe { DCHECK_GE(level, 1); #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { - new (&no_init_.name) string(name_generator()); + new (&no_init_.name) std::string(name_generator()); start_time_ = EnvTime::NowNanos(); } #endif } + ~TraceMe() { Stop(); } + // Stop tracing the activity. Called by the destructor, but exposed to allow // stopping tracing before the object goes out of scope. Only has an effect // the first time it is called. @@ -171,23 +172,21 @@ class TraceMe { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - absl::string_view orig = no_init_.name; - if (absl::EndsWith(orig, "#")) { - // orig does have metadata. - absl::ConsumeSuffix(&orig, "#"); - absl::ConsumePrefix(&new_metadata, "#"); - no_init_.name = absl::StrCat(orig, ",", new_metadata); - } else { - // orig does not have metadata. 
- absl::StrAppend(&no_init_.name, new_metadata); + std::string& name = no_init_.name; + DCHECK(!name.empty()); + DCHECK(!new_metadata.empty()); + if (name.back() == '#') { // name already has metadata + name.back() = ','; + if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { + new_metadata.remove_prefix(1); + } } + name.append(new_metadata.data(), new_metadata.size()); } } #endif } - ~TraceMe() { Stop(); } - // Static API, for use when scoped objects are inconvenient. // Record the start time of an activity. @@ -196,7 +195,7 @@ class TraceMe { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { uint64 activity_id = TraceMeRecorder::NewActivityId(); - TraceMeRecorder::Record({activity_id, string(name), + TraceMeRecorder::Record({activity_id, std::string(name), /*start_time=*/EnvTime::NowNanos(), /*end_time=*/0}); return activity_id; @@ -211,7 +210,8 @@ class TraceMe { // We don't check the level again (see TraceMe::Stop()). if (TF_PREDICT_FALSE(activity_id != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - TraceMeRecorder::Record({activity_id, /*name=*/"", /*start_time=*/0, + TraceMeRecorder::Record({activity_id, /*name=*/std::string(), + /*start_time=*/0, /*end_time=*/EnvTime::NowNanos()}); } } @@ -239,7 +239,7 @@ class TraceMe { union NoInit { NoInit() {} ~NoInit() {} - string name; + std::string name; } no_init_; uint64 start_time_ = kUntracedActivity; From 64d839bb754b104e151bb49bb4ec46dbe690745d Mon Sep 17 00:00:00 2001 From: bhack Date: Fri, 15 May 2020 18:21:51 +0200 Subject: [PATCH 262/412] Fix lint and improve readibility --- tensorflow/python/ops/map_fn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/ops/map_fn.py b/tensorflow/python/ops/map_fn.py index b98b4ad10bc..40f8edfcdd1 100644 --- a/tensorflow/python/ops/map_fn.py +++ b/tensorflow/python/ops/map_fn.py @@ -479,7 +479,8 @@ def map_fn(fn, elems_value_flat = _elems_value_batchable_to_flat(elems_value_batchable, elems_flat_signature) elems_value = elems_unflatten(elems_value_flat) - autographed_fn = autograph.tf_convert(fn, autograph_ctx.control_status_ctx()) + ag_ctx = autograph_ctx.control_status_ctx() + autographed_fn = autograph.tf_convert(fn, ag_ctx) result_value = autographed_fn(elems_value) nest.assert_same_structure(fn_output_signature or elems, result_value) result_value_flat = nest.flatten(result_value) From c568e0dd7fe372db4f5380f20ea2f96ebdd1b935 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 09:19:40 -0700 Subject: [PATCH 263/412] Avoid overhead for creating executors if there is no change in execution mode. PiperOrigin-RevId: 311742240 Change-Id: I8676ab711d3c3d9e64d4ec142e5d934f7c32ee73 --- tensorflow/python/eager/context.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index c6ef21402d2..86b3d5cf95f 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -2027,8 +2027,6 @@ def execution_mode(mode): """Context manager for setting execution mode for current thread.""" if mode is None: yield - elif (mode == ASYNC) == context().executor.is_async(): - yield else: ctx = context() executor_new = executor.new_executor(mode == ASYNC) From 02b5a6754bb9f62f1b415783e684ab8a69c4a01b Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 15 May 2020 09:32:35 -0700 Subject: [PATCH 264/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/9d4b4f344d8e PiperOrigin-RevId: 311744575 Change-Id: Icdd7f018b188db8f8768f3b40d6411c2257547c1 --- .../compiler/xla/service/mlir_gpu/BUILD | 2 +- .../conv_emitter/conv_emitter_test.cc | 2 +- .../xla/service/mlir_gpu/kernel_lowering.cc | 6 ++-- third_party/mlir/BUILD | 34 +++++++++---------- third_party/mlir/test.BUILD | 2 +- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index a57e4300d6e..07655a61074 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -185,10 +185,10 @@ cc_library( "@llvm-project//mlir:LinalgOps", "@llvm-project//mlir:LinalgToLLVM", "@llvm-project//mlir:LinalgTransforms", - "@llvm-project//mlir:LoopsToGPUPass", "@llvm-project//mlir:NVVMDialect", "@llvm-project//mlir:Pass", "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:SCFToGPUPass", "@llvm-project//mlir:SCFTransforms", "@llvm-project//mlir:StandardOps", "@llvm-project//mlir:Support", diff --git a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc index 56684b1f726..d5cad385324 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/experimental/conv_emitter/conv_emitter_test.cc @@ -18,7 +18,7 @@ limitations under the License. #include #include "llvm/Support/raw_ostream.h" -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 847ad918308..4645b084eb6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -19,8 +19,8 @@ limitations under the License. 
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project #include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h" // from @llvm-project -#include "mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h" // from @llvm-project -#include "mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project +#include "mlir/Conversion/SCFToStandard/SCFToStandard.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" // from @llvm-project #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" // from @llvm-project #include "mlir/Dialect/Affine/IR/AffineOps.h" // from @llvm-project @@ -351,7 +351,7 @@ struct FixKernelFunctionSignatures struct MapParallelLoops : public mlir::PassWrapper { void runOnFunction() override { - mlir::greedilyMapParallelLoopsToGPU(getFunction().getBody()); + mlir::greedilyMapParallelSCFToGPU(getFunction().getBody()); } }; diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 93843d58f30..5636bc27cff 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -2005,9 +2005,9 @@ cc_library( ) cc_library( - name = "LoopsToGPU", - srcs = ["lib/Conversion/LoopsToGPU/LoopsToGPU.cpp"], - hdrs = ["include/mlir/Conversion/LoopsToGPU/LoopsToGPU.h"], + name = "SCFToGPU", + srcs = ["lib/Conversion/SCFToGPU/SCFToGPU.cpp"], + hdrs = ["include/mlir/Conversion/SCFToGPU/SCFToGPU.h"], includes = ["include"], deps = [ ":Affine", @@ -2027,22 +2027,22 @@ cc_library( ) cc_library( - name = "LoopsToGPUPass", + name = "SCFToGPUPass", srcs = [ - "lib/Conversion/LoopsToGPU/LoopsToGPUPass.cpp", "lib/Conversion/PassDetail.h", + "lib/Conversion/SCFToGPU/SCFToGPUPass.cpp", ], hdrs = [ - "include/mlir/Conversion/LoopsToGPU/LoopsToGPUPass.h", + "include/mlir/Conversion/SCFToGPU/SCFToGPUPass.h", ], includes = ["include"], deps = [ ":Affine", ":ConversionPassIncGen", ":GPUDialect", - ":LoopsToGPU", ":Pass", ":SCFDialect", + ":SCFToGPU", ":StandardOps", ":Support", ":Transforms", @@ -2053,11 +2053,11 @@ cc_library( cc_library( name = "CFGTransforms", srcs = [ - "lib/Conversion/LoopToStandard/LoopToStandard.cpp", "lib/Conversion/PassDetail.h", + "lib/Conversion/SCFToStandard/SCFToStandard.cpp", ], hdrs = [ - "include/mlir/Conversion/LoopToStandard/ConvertLoopToStandard.h", + "include/mlir/Conversion/SCFToStandard/SCFToStandard.h", ], includes = ["include"], deps = [ @@ -2468,7 +2468,7 @@ cc_library( ":Support", ":Transforms", ":VectorToLLVM", - ":VectorToLoops", + ":VectorToSCF", "@llvm-project//llvm:support", "@llvm-project//mlir/test:TestAffine", "@llvm-project//mlir/test:TestDialect", @@ -2547,13 +2547,13 @@ cc_library( ":LinalgToStandard", ":LinalgTransforms", ":LoopPassIncGen", - ":LoopsToGPUPass", ":NVVMDialect", ":OpenMPDialect", ":QuantOps", ":QuantPassIncGen", ":ROCDLDialect", ":SCFDialect", + ":SCFToGPUPass", ":SCFTransforms", ":SDBM", ":SPIRVDialect", @@ -2602,11 +2602,11 @@ cc_binary( deps = [ ":Analysis", ":IR", - ":LoopsToGPUPass", ":MlirOptLib", ":MlirOptMain", ":OpenMPDialect", ":QuantOps", + ":SCFToGPUPass", ":Transforms", "@llvm-project//llvm:all_targets", "@llvm-project//llvm:support", @@ -3117,7 +3117,7 @@ cc_library( ":Support", ":Transforms", ":VectorToLLVM", - ":VectorToLoops", + ":VectorToSCF", "@llvm-project//llvm:core", "@llvm-project//llvm:support", ], @@ -3355,13 +3355,13 @@ cc_library( ) cc_library( - name = "VectorToLoops", + name = 
"VectorToSCF", srcs = glob([ - "lib/Conversion/VectorToLoops/*.cpp", - "lib/Conversion/VectorToLoops/*.h", + "lib/Conversion/VectorToSCF/*.cpp", + "lib/Conversion/VectorToSCF/*.h", ]), hdrs = glob([ - "include/mlir/Conversion/VectorToLoops/*.h", + "include/mlir/Conversion/VectorToSCF/*.h", ]), includes = ["include"], deps = [ diff --git a/third_party/mlir/test.BUILD b/third_party/mlir/test.BUILD index eb5d8a650eb..24b310f076e 100644 --- a/third_party/mlir/test.BUILD +++ b/third_party/mlir/test.BUILD @@ -171,7 +171,7 @@ cc_library( "@llvm-project//mlir:Transforms", "@llvm-project//mlir:VectorOps", "@llvm-project//mlir:VectorToLLVM", - "@llvm-project//mlir:VectorToLoops", + "@llvm-project//mlir:VectorToSCF", ], ) From 9957cb60a248ba1e61d5606a3d0a189290f36b37 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Fri, 15 May 2020 10:05:35 -0700 Subject: [PATCH 265/412] Bump open source llvm revision to 9d4b4f344d8ea917e082cf58d66b71c0171e1650 PiperOrigin-RevId: 311751290 Change-Id: Ie8366f82180116dd363c3ed7ece36f948196bf1b --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index c3d097a8362..949c6920e33 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "bfa200ebcf3706fde0dde335a3c1fa3fe1b3ba3f" - LLVM_SHA256 = "72deefcfe20434cb27a31ff9503c348dcf21065dbd27e9fa54c1fb3f5089b8e1" + LLVM_COMMIT = "9d4b4f344d8ea917e082cf58d66b71c0171e1650" + LLVM_SHA256 = "36e4470b5656cea3e0afb218edbdd96376fcb51dc2c5ed887b21237068baee41" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 53c634a6c150da732dcd6305478ffecd6a887668 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Fri, 15 May 2020 10:17:05 -0700 Subject: [PATCH 266/412] [MLIR/XLA] Constant sinking to control flow regions. This is necessary for exporting to XLA since functional control flow is expected. 
PiperOrigin-RevId: 311753796 Change-Id: If4e50a3b2fa668f162c9b30cc80e2bf743a9b641 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../tensorflow/utils/compile_mlir_util.cc | 4 + tensorflow/compiler/mlir/xla/BUILD | 19 +++++ .../tests/sink-constants-to-control-flow.mlir | 60 +++++++++++++ .../compiler/mlir/xla/transforms/passes.h | 4 + .../sink_constants_to_control_flow.cc | 85 +++++++++++++++++++ 6 files changed, 173 insertions(+) create mode 100644 tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir create mode 100644 tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 54b560ed6ce..eb220a31f80 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1140,6 +1140,7 @@ COMPILE_MLIR_UTIL_DEPS = [ "//tensorflow/compiler/mlir/xla:type_to_shape", "//tensorflow/compiler/mlir/xla:xla_legalize_tf", "//tensorflow/compiler/mlir/xla:xla_legalize_tf_with_tf2xla", + "//tensorflow/compiler/mlir/xla:xla_sink_constants_to_control_flow", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/core:framework", diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index e8ca691f961..03283da0112 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -305,6 +305,10 @@ Status ConvertMLIRToXlaComputation( // invocation. tf2xla.addNestedPass( mlir::xla_hlo::createLegalizeTFPass(false)); + // In order to export to XLA, we must sink constants to control flow regions, + // since XLA uses functional control flow. + tf2xla.addNestedPass( + mlir::xla_hlo::createSinkConstantsToControlFlowPass()); if (VLOG_IS_ON(1)) { // Print the whole module after each pass which requires disabling diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 12334e463fa..179a637ec7b 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -193,6 +193,24 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "xla_sink_constants_to_control_flow", + srcs = [ + "transforms/sink_constants_to_control_flow.cc", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:lower_tf_lib", + "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], + alwayslink = 1, +) + cc_library( name = "map_xla_to_scalar_op", hdrs = ["transforms/map_xla_to_scalar_op.h"], @@ -873,6 +891,7 @@ cc_library( ":xla_legalize_to_standard", ":xla_lower", ":xla_materialize_broadcasts", + ":xla_sink_constants_to_control_flow", ":xla_test_passes", ], ) diff --git a/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir new file mode 100644 index 00000000000..c2fbad2faec --- /dev/null +++ b/tensorflow/compiler/mlir/xla/tests/sink-constants-to-control-flow.mlir @@ -0,0 +1,60 @@ +// RUN: xla-opt %s -xla-hlo-sink-constants-to-control-flow | FileCheck %s --dump-input=fail + +// Tests sinking constants to a while loop. 
+ +// CHECK-LABEL: func @sink_const_to_while +func @sink_const_to_while(%arg0: tensor) -> tensor { + // CHECK-NEXT: xla_hlo.while + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.while"(%arg0) ( { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1A:.+]]: tensor + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + // CHECK: "xla_hlo.compare"(%[[C0]], %[[ARG1A]]) + %1 = "xla_hlo.compare"(%c0, %arg1) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + "xla_hlo.return"(%1) : (tensor) -> () + }, { + ^bb0(%arg1: tensor): + // CHECK: %[[ARG1B:.+]]: tensor + // CHECK-DAG: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + // CHECK-DAG: %[[ADD0:.+]] = xla_hlo.add %[[ARG1B]], %[[ARG1B]] + %2 = xla_hlo.add %arg1, %arg1 : tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], %[[ADD0]] + %3 = xla_hlo.add %c1, %2 : tensor + // CHECK: %[[ADD2:.+]] = xla_hlo.add %[[C1]], %[[ADD1]] + %4 = xla_hlo.add %c1, %3 : tensor + "xla_hlo.return"(%4) : (tensor) -> () + }) : (tensor) -> tensor + return %0 : tensor +} + +// Tests sinking constants to a conditional op. + +// CHECK-LABEL: func @sink_const_to_conditional +func @sink_const_to_conditional(%arg0: tensor) -> tensor { + %c0 = xla_hlo.constant dense<1> : tensor + %c1 = xla_hlo.constant dense<2> : tensor + %0 = "xla_hlo.compare"(%arg0, %c0) {comparison_direction = "LT"} : (tensor, tensor) -> tensor + %1 = "xla_hlo.tuple"(%arg0) : (tensor) -> tuple> + // CHECK: xla_hlo.conditional + %2 = "xla_hlo.conditional"(%0, %1, %1) ( { + ^bb0(%arg1: tuple>): + // CHECK: %[[C0:.+]] = xla_hlo.constant dense<1> : tensor + %3 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD0:.+]] = xla_hlo.add %[[C0]], + %4 = xla_hlo.add %c0, %3 : tensor + %5 = "xla_hlo.tuple"(%4) : (tensor) -> tuple> + "xla_hlo.return"(%5) : (tuple>) -> () + }, { + ^bb0(%arg1: tuple>): + // CHECK: %[[C1:.+]] = xla_hlo.constant dense<2> : tensor + %6 = "xla_hlo.get_tuple_element"(%arg1) {index = 0 : i32} : (tuple>) -> tensor + // CHECK: %[[ADD1:.+]] = xla_hlo.add %[[C1]], + %7 = xla_hlo.add %c1, %6 : tensor + %8 = "xla_hlo.tuple"(%7) : (tensor) -> tuple> + "xla_hlo.return"(%8) : (tuple>) -> () + }) : (tensor, tuple>, tuple>) -> tuple> + %9 = "xla_hlo.get_tuple_element"(%2) {index = 0 : i32} : (tuple>) -> tensor + return %9 : tensor +} diff --git a/tensorflow/compiler/mlir/xla/transforms/passes.h b/tensorflow/compiler/mlir/xla/transforms/passes.h index 39375e210d5..b148eac4286 100644 --- a/tensorflow/compiler/mlir/xla/transforms/passes.h +++ b/tensorflow/compiler/mlir/xla/transforms/passes.h @@ -65,6 +65,10 @@ std::unique_ptr> createLegalizeToLhloPass(); // Lowers from HLO dialect to Linalg dialect. std::unique_ptr> createLegalizeHloToLinalgPass(); +// Sinks constants implicitly captured in control flow regions. This is +// necessary to export to XLA. +std::unique_ptr> createSinkConstantsToControlFlowPass(); + } // namespace xla_hlo namespace xla_lhlo { diff --git a/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc new file mode 100644 index 00000000000..29646465acd --- /dev/null +++ b/tensorflow/compiler/mlir/xla/transforms/sink_constants_to_control_flow.cc @@ -0,0 +1,85 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/xla/ir/hlo_ops.h" + +namespace mlir { +namespace xla_hlo { + +namespace { + +// A pass that sinks constants implicitly captured in control flow regions. This +// is necessary to export to XLA. +class SinkConstantsToControlFlow + : public mlir::PassWrapper { + void runOnFunction() override { + getFunction().walk([](Operation* op) { + if (auto while_op = llvm::dyn_cast(op)) { + SinkToRegion(&while_op.body()); + SinkToRegion(&while_op.cond()); + } else if (auto cond_op = llvm::dyn_cast(op)) { + SinkToRegion(&cond_op.true_branch()); + SinkToRegion(&cond_op.false_branch()); + } + }); + } + + private: + // Performs constant sinking into a region. + static void SinkToRegion(Region* region) { + llvm::DenseMap sunk_constant; + visitUsedValuesDefinedAbove({*region}, [&](OpOperand* use) { + Value constant = use->get(); + auto const_op = dyn_cast_or_null(constant.getDefiningOp()); + if (!const_op) return; + auto map_entry = sunk_constant.try_emplace(constant, nullptr); + if (!map_entry.second) { + // This constant has already been cloned into the region, reuse it. + use->set(map_entry.first->getSecond().getResult()); + if (constant.use_empty()) const_op.erase(); + return; + } + if (constant.hasOneUse()) { + const_op.getOperation()->moveBefore(®ion->front().front()); + return; + } + map_entry.first->getSecond() = const_op.clone(); + region->front().getOperations().insert(region->front().begin(), + map_entry.first->getSecond()); + use->set(map_entry.first->getSecond().getResult()); + }); + } +}; + +static mlir::PassRegistration pass( + "xla-hlo-sink-constants-to-control-flow", + "Sink constants implicitly captured in control flow regions. This is " + "necessary to export to XLA."); + +} // anonymous namespace + +std::unique_ptr> createSinkConstantsToControlFlowPass() { + return std::make_unique(); +} + +} // namespace xla_hlo +} // namespace mlir From 2540d202b5b798c7cea953b60247b834bef3ca07 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Fri, 15 May 2020 10:19:17 -0700 Subject: [PATCH 267/412] Fix TF2XLA's InitGraph for unused feeds. If a feed is not used, previously it would prune the placeholders and cause crashes. 
PiperOrigin-RevId: 311754319 Change-Id: Ie1ad67c21ffb83ba88aeabea94c416473df099a0 --- .../compiler/tf2xla/graph_compiler_util.cc | 27 ++++++++++---- tensorflow/compiler/tf2xla/tf2xla_test.cc | 37 +++++++++++++++++++ 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tf2xla/graph_compiler_util.cc b/tensorflow/compiler/tf2xla/graph_compiler_util.cc index 57278eea292..a9385e05564 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler_util.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler_util.cc @@ -49,10 +49,12 @@ typedef std::unordered_map NodeMap; // Each feed id identifies the positional output of some node, which may consist // of multiple edges. AddPlaceholdersForFeeds has already replaced each fed // tensor with a placeholder. For each feed tensor, replaces all edges so they -// point from a new _Arg node instead. +// point from a new _Arg node instead. The newly created _Arg nodes are added to +// `arg_nodes`. Status AddArgNodes(Graph* graph, const NodeMap& node_map, const protobuf::RepeatedPtrField& feeds, - const std::unordered_map& feed_remapping) { + const std::unordered_map& feed_remapping, + std::unordered_set* arg_nodes) { for (int arg_index = 0; arg_index < feeds.size(); ++arg_index) { const tf2xla::Feed& feed = feeds[arg_index]; // All feeds have been replaced by placeholders. @@ -86,6 +88,7 @@ Status AddArgNodes(Graph* graph, const NodeMap& node_map, .Attr(kShapeAttr, TensorShape(feed.shape())) .Attr(kDebugNameAttr, feed.name()) .Finalize(graph, &arg_node)); + arg_nodes->insert(arg_node); // Collects out-edges from the feed node that have a matching edge index; // these will be replaced with edges from the arg node instead. @@ -149,13 +152,13 @@ Status RewriteAndPruneGraph( for (Node* n : graph->nodes()) { node_map[n->name()] = n; } + std::unordered_set nodes_to_keep; + TF_RETURN_IF_ERROR(AddArgNodes(graph, node_map, config.feed(), feed_remapping, + &nodes_to_keep)); TF_RETURN_IF_ERROR( - AddArgNodes(graph, node_map, config.feed(), feed_remapping)); - std::unordered_set retval_nodes; - TF_RETURN_IF_ERROR( - AddRetvalNodes(graph, node_map, config.fetch(), &retval_nodes)); + AddRetvalNodes(graph, node_map, config.fetch(), &nodes_to_keep)); VLOG(2) << "Post rewrite: " << DumpGraphToFile("tf2xla_post_rewrite", *graph); - PruneForReverseReachability(graph, std::move(retval_nodes)); + PruneForReverseReachability(graph, std::move(nodes_to_keep)); FixupSourceAndSinkEdges(graph); VLOG(2) << "Post prune: " << DumpGraphToFile("tfcompile_post_prune", *graph); // Sanity-check, to make sure the feeds and fetches still exist post-pruning. @@ -277,8 +280,16 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, // Prune the GraphDef first so that unknown ops that we aren't compiling get // filtered out. GraphDef second_copy_def; + // Add the placeholder nodes as "fetches" in prune_config, such that they will + // be preserved in PruneGraphDefInto. 
+ auto prune_config = config; + for (const auto& entry : feed_remapping) { + auto ph = prune_config.add_fetch(); + *ph->mutable_id()->mutable_node_name() = entry.second; + ph->mutable_id()->set_output_index(0); + } TF_RETURN_IF_ERROR( - PruneGraphDefInto(config, first_copy_def, &second_copy_def)); + PruneGraphDefInto(prune_config, first_copy_def, &second_copy_def)); TF_RETURN_IF_ERROR(AddDefaultAttrsToGraphDef( &second_copy_def, *g->op_registry(), /*node_offset=*/0)); diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index 24afe595b18..7ea69f734c9 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -99,5 +99,42 @@ TEST(ConvertGraphDefToXla, Sum) { ConvertGraphDefToXla(graph_def, config, client, &computation))); } +TEST(ConvertGraphDefToXla, SumWithUnusedArgument) { + GraphDef graph_def = SumGraph(); + tf2xla::Config config = SumConfig(); + NodeDef* unused = graph_def.add_node(); + unused->set_name("unused"); + unused->set_op("Placeholder"); + (*unused->mutable_attr())["dtype"] = TypeAttrValue(DT_INT32); + config.add_feed()->mutable_id()->set_node_name("unused"); + + xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); + xla::XlaComputation computation; + TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); + + // Set up arguments. + auto x_literal = xla::LiteralUtil::CreateR0(10); + auto y_literal = xla::LiteralUtil::CreateR0(32); + auto x_global_or = client->TransferToServer(x_literal); + auto y_global_or = client->TransferToServer(y_literal); + auto unused_global_or = client->TransferToServer(y_literal); + TF_EXPECT_OK(x_global_or.status()); + TF_EXPECT_OK(y_global_or.status()); + TF_EXPECT_OK(unused_global_or.status()); + std::unique_ptr x_global = + std::move(x_global_or.ValueOrDie()); + std::unique_ptr y_global = + std::move(y_global_or.ValueOrDie()); + std::unique_ptr unused_global = + std::move(unused_global_or.ValueOrDie()); + + // Execute and check result. + auto result_or = client->ExecuteAndTransfer( + computation, {x_global.get(), y_global.get(), unused_global.get()}); + TF_EXPECT_OK(result_or.status()); + xla::Literal result = std::move(result_or.ValueOrDie()); + EXPECT_EQ("(\ns32[] 42\n)", result.ToString()); +} + } // namespace } // namespace tensorflow From 76d3d13b5ad112300796a2f78be26031f9c71571 Mon Sep 17 00:00:00 2001 From: Rajeshwar Reddy T <43972606+rthadur@users.noreply.github.com> Date: Fri, 15 May 2020 10:27:17 -0700 Subject: [PATCH 268/412] Create bot_config.yml --- .github/bot_config.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/bot_config.yml diff --git a/.github/bot_config.yml b/.github/bot_config.yml new file mode 100644 index 00000000000..d63bd2ce844 --- /dev/null +++ b/.github/bot_config.yml @@ -0,0 +1,29 @@ + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. + # + # Licensed under the Apache License, Version 2.0 (the "License"); + # you may not use this file except in compliance with the License. + # You may obtain a copy of the License at + # + # http://www.apache.org/licenses/LICENSE-2.0 + # + # Unless required by applicable law or agreed to in writing, software + # distributed under the License is distributed on an "AS IS" BASIS, + # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + # See the License for the specific language governing permissions and + # limitations under the License. 
+ # ============================================================================ + # + # THIS IS A GENERATED DOCKERFILE. + # + # This file was assembled from multiple pieces, whose use is documented + # throughout. Please refer to the TensorFlow dockerfiles documentation + # for more information. + +# A list of assignees +assignees: + - amahendrakar + - ravikyram + - Saduf2019 +# A list of assignees for +compiler_assignees: + - joker-eph From d62a22a30000f11bde298daa86d82004e8531767 Mon Sep 17 00:00:00 2001 From: Tomer Kaftan Date: Fri, 15 May 2020 10:43:51 -0700 Subject: [PATCH 269/412] Extend Keras Lambda layers to work with functions of any signature rather than only functions that take one argument. Any *args and **kwargs passed when calling the lambda layer will be forwarded directly to the underlying lambda. PiperOrigin-RevId: 311759844 Change-Id: Ia5ffe17f2951e4fd42d9ee4020c7c8b35ef9122f --- tensorflow/python/keras/layers/core.py | 43 ++++++++++--------- tensorflow/python/keras/layers/core_test.py | 20 +++++++++ .../v1/tensorflow.keras.layers.-lambda.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-lambda.pbtxt | 2 +- 4 files changed, 45 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index db9c47eca17..d1528c7ba59 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -53,7 +53,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging from tensorflow.python.training.tracking import base as trackable from tensorflow.python.util import nest -from tensorflow.python.util import tf_inspect +from tensorflow.python.util import tf_decorator from tensorflow.python.util.tf_export import keras_export @@ -738,7 +738,8 @@ class Lambda(Layer): models. `Lambda` layers are best suited for simple operations or quick experimentation. For more advanced use cases, follow [this guide](https://www.tensorflow.org/guide/keras/custom_layers_and_models) - for subclassing `tf.keras.layers.Layer`. + for subclassing `tf.keras.layers.Layer`. (Do not subclass + `tf.keras.layers.Lamba`.) The main reason to subclass `tf.keras.layers.Layer` instead of using a `Lambda` layer is saving and inspecting a Model. `Lambda` layers @@ -798,8 +799,7 @@ class Lambda(Layer): computation, but anything more complex should use a subclass Layer instead. Arguments: - function: The function to be evaluated. Takes input tensor as first - argument. + function: The function to evaluate when the layer is called. output_shape: Expected output shape from function. This argument can be inferred if not explicitly provided. Can be a tuple or function. If a tuple, it only specifies the first dimension onward; @@ -812,8 +812,8 @@ class Lambda(Layer): mask: Either None (indicating no masking) or a callable with the same signature as the `compute_mask` layer method, or a tensor that will be returned as output mask regardless of what the input is. - arguments: Optional dictionary of keyword arguments to be passed to the - function. + arguments: Optional dictionary of keyword arguments to pass by default to + the function when those arguments are not passed to the layer call. Input shape: Arbitrary. Use the keyword argument input_shape (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. 
@@ -823,11 +823,16 @@ class Lambda(Layer): @trackable.no_automatic_dependency_tracking def __init__(self, function, output_shape=None, mask=None, arguments=None, **kwargs): - super(Lambda, self).__init__(**kwargs) - self.arguments = arguments or {} self.function = function + # Decorate the function to produce this layer's call method + def _call_wrapper(*args, **kwargs): + return self._call_wrapper(*args, **kwargs) + self.call = tf_decorator.make_decorator(function, _call_wrapper) + + super(Lambda, self).__init__(**kwargs) + if mask is not None: self.supports_masking = True self.mask = mask @@ -836,9 +841,8 @@ class Lambda(Layer): # Warning on every invocation will be quite irksome in Eager mode. self._already_warned = False - function_args = tf_inspect.getfullargspec(function).args - self._fn_expects_training_arg = 'training' in function_args - self._fn_expects_mask_arg = 'mask' in function_args + self._expects_training_arg = 'training' in self._call_fn_args + self._expects_mask_arg = 'mask' in self._call_fn_args @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): @@ -869,23 +873,22 @@ class Lambda(Layer): output_shapes = tf_utils.convert_shapes(self._output_shape, to_tuples=False) return nest.map_structure(_add_batch, output_shapes) - def call(self, inputs, mask=None, training=None): + def _call_wrapper(self, *args, **kwargs): # We must copy for thread safety, but it only needs to be a shallow copy. - kwargs = {k: v for k, v in self.arguments.items()} - if self._fn_expects_mask_arg: - kwargs['mask'] = mask - if self._fn_expects_training_arg: - kwargs['training'] = training + call_kwargs = {k: v for k, v in self.arguments.items()} + + # override default kwargs with the args passed to the layer call + call_kwargs.update(kwargs) created_variables = [] - def _variable_creator(next_creator, **kwargs): - var = next_creator(**kwargs) + def _variable_creator(next_creator, **creator_kwargs): + var = next_creator(**creator_kwargs) created_variables.append(var) return var with backprop.GradientTape(watch_accessed_variables=True) as tape,\ variable_scope.variable_creator_scope(_variable_creator): - result = self.function(inputs, **kwargs) + result = self.function(*args, **call_kwargs) self._check_variables(created_variables, tape.watched_variables()) return result diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py index 3daa187f1ce..aa1192e12fc 100644 --- a/tensorflow/python/keras/layers/core_test.py +++ b/tensorflow/python/keras/layers/core_test.py @@ -139,6 +139,26 @@ class LambdaLayerTest(keras_parameterized.TestCase): out = ld([x1, x2]) self.assertAllEqual(out.shape, [3, 2]) + def test_lambda_multiple_args(self): + ld = keras.layers.Lambda(lambda x, y: x[0] + y) + x1 = np.ones([3, 2], np.float32) + x2 = np.ones([3, 5], np.float32) + + expected_result = x1 * 2 + self.assertAllEqual(ld([x1, x2], x1), expected_result) + self.assertAllEqual(ld([x1, x2], y=x1), expected_result) + self.assertAllEqual(ld(x=[x1, x2], y=x1), expected_result) + + def test_lambda_constructor_args_and_multiple_args(self): + x1 = np.ones([3, 2], np.float32) + x2 = np.ones([3, 5], np.float32) + ld = keras.layers.Lambda(lambda x, y: x[0] + y, arguments={'y': x1*2}) + + self.assertAllEqual(ld([x1, x2]), x1 * 3) + self.assertAllEqual(ld([x1, x2], y=x1), x1 * 2) + self.assertAllEqual(ld(x=[x1, x2]), x1 * 3) + self.assertAllEqual(ld(x=[x1, x2], y=x1), x1 * 2) + def test_lambda_output_shape(self): l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 
1)) l(keras.backend.variable(np.ones((1, 1)))) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt index 22fa730112f..d4dbe96d1ba 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt index 22fa730112f..d4dbe96d1ba 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" } member_method { name: "compute_mask" From bd010a095ee4eca62b39ac54e0d96e93adf49672 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 10:45:17 -0700 Subject: [PATCH 270/412] String formatting in assertAllEqual() fails in Python 3 because bytestring may be converted using %s. Use %r to fix this. PiperOrigin-RevId: 311760220 Change-Id: Ia46073b51bc38b8e88016edab37bc34ceebd5d7f --- tensorflow/python/framework/test_util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index d5bbd889166..4981e1b68fd 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -2686,7 +2686,7 @@ class TensorFlowTestCase(googletest.TestCase): if (b.ndim <= 3 or b.size < 500): self.assertEqual( a.shape, b.shape, "Shape mismatch: expected %s, got %s." - " Contents: %s. \n%s." % (a.shape, b.shape, b, msg)) + " Contents: %r. \n%s." % (a.shape, b.shape, b, msg)) else: self.assertEqual( a.shape, b.shape, "Shape mismatch: expected %s, got %s." @@ -2709,8 +2709,8 @@ class TensorFlowTestCase(googletest.TestCase): else: # np.where is broken for scalars x, y = a, b - msgs.append("not equal lhs = {}".format(x)) - msgs.append("not equal rhs = {}".format(y)) + msgs.append("not equal lhs = %r" % x) + msgs.append("not equal rhs = %r" % y) # With Python 3, we need to make sure the dtype matches between a and b. b = b.astype(a.dtype) np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs)) From 262e92804b465874927d48be30311147692dd7a9 Mon Sep 17 00:00:00 2001 From: Xunkai Zhang Date: Fri, 15 May 2020 10:46:10 -0700 Subject: [PATCH 271/412] [tfls.util] Remove tensorflow-lite-gpu from dependencies. 
PiperOrigin-RevId: 311760392 Change-Id: Ia8fe0682cfda037589f7546f1e70974c1be439c5 --- .../org/tensorflow/lite/gpu/GpuDelegate.java | 16 +++-- .../lite/experimental/support/java/BUILD | 19 ++++- .../lite/support/model/GpuDelegateProxy.java | 69 +++++++++++++++++++ .../tensorflow/lite/support/model/Model.java | 23 ++++--- 4 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java diff --git a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java index 895f12f0233..78cab0d2cbf 100644 --- a/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java +++ b/tensorflow/lite/delegates/gpu/java/src/main/java/org/tensorflow/lite/gpu/GpuDelegate.java @@ -17,18 +17,19 @@ package org.tensorflow.lite.gpu; import java.io.Closeable; import org.tensorflow.lite.Delegate; +import org.tensorflow.lite.annotations.UsedByReflection; /** * {@link Delegate} for GPU inference. * - *
<p>
Note: When calling {@code Interpreter.modifyGraphWithDelegate()}/ - * {@code Interpreter.Options.addDelegate()} and {@code Interpreter.run()}, the caller must have an - * {@code EGLContext} in the current thread and {@code Interpreter.run()} must be called from - * the same {@code EGLContext}. If an {@code EGLContext} does not exist, the delegate will - * internally create one, but then the developer must ensure that {@code Interpreter.run()} is - * always called from the same thread in which {@code Interpreter.modifyGraphWithDelegate()} was - * called. + *
<p>
Note: When calling {@code Interpreter.modifyGraphWithDelegate()}/ {@code + * Interpreter.Options.addDelegate()} and {@code Interpreter.run()}, the caller must have an {@code + * EGLContext} in the current thread and {@code Interpreter.run()} must be called from the + * same {@code EGLContext}. If an {@code EGLContext} does not exist, the delegate will internally + * create one, but then the developer must ensure that {@code Interpreter.run()} is always called + * from the same thread in which {@code Interpreter.modifyGraphWithDelegate()} was called. */ +@UsedByReflection("TFLiteSupport/model/GpuDelegateProxy") public class GpuDelegate implements Delegate, Closeable { private static final long INVALID_DELEGATE_HANDLE = 0; @@ -98,6 +99,7 @@ public class GpuDelegate implements Delegate, Closeable { options.inferencePreference); } + @UsedByReflection("TFLiteSupport/model/GpuDelegateProxy") public GpuDelegate() { this(new Options()); } diff --git a/tensorflow/lite/experimental/support/java/BUILD b/tensorflow/lite/experimental/support/java/BUILD index 43e984a0cb8..85f5da17193 100644 --- a/tensorflow/lite/experimental/support/java/BUILD +++ b/tensorflow/lite/experimental/support/java/BUILD @@ -9,7 +9,24 @@ package( licenses = ["notice"], # Apache 2.0 ) +# TODO(b/156482505): The NOGPU target is a temporary target. Internally, people +# may already depend on "tensorflow-lite-support" so we shouldn't remove GPU +# from its dependency. We will have CLs to help users migrate. After migration +# is done, the "NOGPU" target will be removed. +android_library( + name = "tensorflow-lite-support-nogpu", + srcs = glob(["src/java/org/tensorflow/lite/support/**/*.java"]), + javacopts = JAVACOPTS, + manifest = "AndroidManifest.xml", + deps = [ + "//tensorflow/lite/java:tensorflowlite", + "@org_checkerframework_qual", + ], +) + # TODO(138904786): Split Java part and Android part to make the support library usable by pure Java. +# For new users: Please use "tensorflow-lite-support-nogpu" if possible, and +# additionally depends on "tensorflowlite_gpu" if needed. android_library( name = "tensorflow-lite-support", srcs = glob(["src/java/org/tensorflow/lite/support/**/*.java"]), @@ -17,7 +34,7 @@ android_library( manifest = "AndroidManifest.xml", deps = [ "//tensorflow/lite/java:tensorflowlite", - "//tensorflow/lite/java:tensorflowlite_gpu", + "//tensorflow/lite/java:tensorflowlite_gpu", # unuseddeps: keep "@org_checkerframework_qual", ], ) diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java new file mode 100644 index 00000000000..9cfcf923ded --- /dev/null +++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/GpuDelegateProxy.java @@ -0,0 +1,69 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +package org.tensorflow.lite.support.model; + +import android.util.Log; +import java.io.Closeable; +import java.io.IOException; +import org.checkerframework.checker.nullness.qual.Nullable; +import org.tensorflow.lite.Delegate; + +/** + * Helper class to create and call necessary methods of {@code GpuDelegate} which is not a strict + * dependency. + */ +class GpuDelegateProxy implements Delegate, Closeable { + + private static final String TAG = "GpuDelegateProxy"; + + private final Delegate proxiedDelegate; + private final Closeable proxiedCloseable; + + @Nullable + public static GpuDelegateProxy maybeNewInstance() { + try { + Class clazz = Class.forName("org.tensorflow.lite.gpu.GpuDelegate"); + Object instance = clazz.getDeclaredConstructor().newInstance(); + return new GpuDelegateProxy(instance); + } catch (ReflectiveOperationException e) { + Log.e(TAG, "Failed to create the GpuDelegate dynamically.", e); + return null; + } + } + + /** Calls {@code close()} method of the delegate. */ + @Override + public void close() { + try { + proxiedCloseable.close(); + } catch (IOException e) { + // Should not trigger, because GpuDelegate#close never throws. The catch is required because + // of Closeable#close. + Log.e(TAG, "Failed to close the GpuDelegate.", e); + } + } + + /** Calls {@code getNativeHandle()} method of the delegate. */ + @Override + public long getNativeHandle() { + return proxiedDelegate.getNativeHandle(); + } + + private GpuDelegateProxy(Object instance) { + this.proxiedCloseable = (Closeable) instance; + this.proxiedDelegate = (Delegate) instance; + } +} diff --git a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java index c7f9e83f692..40659e39848 100644 --- a/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java +++ b/tensorflow/lite/experimental/support/java/src/java/org/tensorflow/lite/support/model/Model.java @@ -22,7 +22,6 @@ import java.util.Map; import org.checkerframework.checker.nullness.qual.NonNull; import org.checkerframework.checker.nullness.qual.Nullable; import org.tensorflow.lite.Interpreter; -import org.tensorflow.lite.gpu.GpuDelegate; import org.tensorflow.lite.support.common.FileUtil; import org.tensorflow.lite.support.common.SupportPreconditions; @@ -91,7 +90,7 @@ public class Model { /** The memory-mapped model data. */ private final MappedByteBuffer byteModel; - private final GpuDelegate gpuDelegate; + private final GpuDelegateProxy gpuDelegateProxy; /** * Builder for {@link Model}. @@ -181,24 +180,30 @@ public class Model { * @param modelPath The original path of the model. It can be fetched later by {@link * Model#getPath()}. * @param options The options for running the model. + * @throws IllegalArgumentException if {@code options.device} is {@link Device#GPU} but + * "tensorflow-lite-gpu" is not linked to the project. */ public static Model createModel( @NonNull MappedByteBuffer byteModel, @NonNull String modelPath, @NonNull Options options) { Interpreter.Options interpreterOptions = new Interpreter.Options(); - GpuDelegate gpuDelegate = options.device.equals(Device.GPU) ? 
new GpuDelegate() : null; + GpuDelegateProxy gpuDelegateProxy = null; switch (options.device) { case NNAPI: interpreterOptions.setUseNNAPI(true); break; case GPU: - interpreterOptions.addDelegate(gpuDelegate); + gpuDelegateProxy = GpuDelegateProxy.maybeNewInstance(); + SupportPreconditions.checkArgument( + gpuDelegateProxy != null, + "Cannot inference with GPU. Did you add \"tensorflow-lite-gpu\" as dependency?"); + interpreterOptions.addDelegate(gpuDelegateProxy); break; case CPU: break; } interpreterOptions.setNumThreads(options.numThreads); Interpreter interpreter = new Interpreter(byteModel, interpreterOptions); - return new Model(modelPath, byteModel, interpreter, gpuDelegate); + return new Model(modelPath, byteModel, interpreter, gpuDelegateProxy); } /** Returns the memory-mapped model data. */ @@ -243,8 +248,8 @@ public class Model { if (interpreter != null) { interpreter.close(); } - if (gpuDelegate != null) { - gpuDelegate.close(); + if (gpuDelegateProxy != null) { + gpuDelegateProxy.close(); } } @@ -252,10 +257,10 @@ public class Model { @NonNull String modelPath, @NonNull MappedByteBuffer byteModel, @NonNull Interpreter interpreter, - @Nullable GpuDelegate gpuDelegate) { + @Nullable GpuDelegateProxy gpuDelegateProxy) { this.modelPath = modelPath; this.byteModel = byteModel; this.interpreter = interpreter; - this.gpuDelegate = gpuDelegate; + this.gpuDelegateProxy = gpuDelegateProxy; } } From 26104505b8267c6f08493869e64e59af2ed62326 Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Fri, 15 May 2020 10:50:40 -0700 Subject: [PATCH 272/412] [XLA:Python] Expose new use_spmd_partitioning compile option. PiperOrigin-RevId: 311761373 Change-Id: I1f696e0c082295dc0e6896f05d1e88525de7ce70 --- tensorflow/compiler/xla/python/xla.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index f03595bf677..65fb5311994 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -1406,7 +1406,10 @@ PYBIND11_MODULE(xla_extension, m) { options.device_assignment()) : absl::nullopt; }, - &ExecutableBuildOptions::set_device_assignment); + &ExecutableBuildOptions::set_device_assignment) + .def_property("use_spmd_partitioning", + &ExecutableBuildOptions::use_spmd_partitioning, + &ExecutableBuildOptions::set_use_spmd_partitioning); py::class_(m, "XlaComputation") .def(py::init([](const py::bytes& serialized_hlo_module_proto) From 77245d07d13522a5cb5d060390fffa1894df5bbf Mon Sep 17 00:00:00 2001 From: Edward Loper Date: Fri, 15 May 2020 10:58:42 -0700 Subject: [PATCH 273/412] Add dispatch support to more Python APIs. 
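
A minimal sketch of what "dispatch support" enables (MaskedTensor and MaskedReluDispatcher below are hypothetical names added for illustration; tensorflow.python.util.dispatch is an internal module, so this hook is not a stable public API): once an API is decorated with @dispatch.add_dispatch_support, a registered OpDispatcher gets a chance to handle calls whose argument types the stock implementation rejects.

import tensorflow as tf
from tensorflow.python.util import dispatch  # internal module; subject to change


class MaskedTensor(object):
  """Toy value type that stock TF ops do not understand."""

  def __init__(self, values, mask):
    self.values = values
    self.mask = mask


class MaskedReluDispatcher(dispatch.OpDispatcher):
  """Handles tf.keras.backend.relu(MaskedTensor) by applying relu to .values."""

  def handle(self, args, kwargs):
    x = args[0] if args else kwargs.get('x')
    if isinstance(x, MaskedTensor):
      return MaskedTensor(tf.nn.relu(x.values), x.mask)
    return self.NOT_SUPPORTED  # fall through to the normal error


MaskedReluDispatcher().register(tf.keras.backend.relu)

mt = MaskedTensor(tf.constant([-1.0, 2.0]), tf.constant([True, False]))
result = tf.keras.backend.relu(mt)  # now handled by MaskedReluDispatcher

This is the same hook that extension types such as RaggedTensor rely on, which is why decorating these APIs broadens the set of argument types they accept.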
PiperOrigin-RevId: 311763060 Change-Id: Ib35371483aa083e245996508a82fd13d8ac43131 --- tensorflow/python/keras/activations.py | 16 +++ tensorflow/python/keras/backend.py | 104 ++++++++++++++++++ tensorflow/python/keras/backend_config.py | 3 + tensorflow/python/keras/losses.py | 16 +++ tensorflow/python/keras/metrics.py | 6 + tensorflow/python/ops/array_ops.py | 47 ++++++++ .../python/ops/candidate_sampling_ops.py | 6 + tensorflow/python/ops/check_ops.py | 42 +++++++ tensorflow/python/ops/clip_ops.py | 4 + tensorflow/python/ops/confusion_matrix.py | 3 + tensorflow/python/ops/control_flow_ops.py | 8 ++ tensorflow/python/ops/ctc_ops.py | 9 ++ tensorflow/python/ops/embedding_ops.py | 7 ++ tensorflow/python/ops/functional_ops.py | 7 ++ tensorflow/python/ops/histogram_ops.py | 3 + tensorflow/python/ops/image_ops_impl.py | 80 ++++++++++++-- tensorflow/python/ops/linalg/linalg_impl.py | 12 +- .../ops/linalg/sparse/conjugate_gradient.py | 2 + tensorflow/python/ops/linalg_ops.py | 12 ++ tensorflow/python/ops/logging_ops.py | 3 + tensorflow/python/ops/losses/losses_impl.py | 12 ++ tensorflow/python/ops/manip_ops.py | 2 + tensorflow/python/ops/math_ops.py | 56 +++++++++- tensorflow/python/ops/nn_impl.py | 33 ++++++ tensorflow/python/ops/nn_ops.py | 76 ++++++++++++- tensorflow/python/ops/numerics.py | 3 + tensorflow/python/ops/parsing_ops.py | 11 ++ tensorflow/python/ops/proto_ops.py | 5 +- .../python/ops/ragged/ragged_array_ops.py | 5 + .../python/ops/ragged/ragged_concat_ops.py | 2 + .../python/ops/ragged/ragged_factory_ops.py | 4 + .../ops/ragged/ragged_functional_ops.py | 2 + .../python/ops/ragged/ragged_math_ops.py | 2 + .../python/ops/ragged/ragged_string_ops.py | 11 ++ .../python/ops/ragged/segment_id_ops.py | 3 + tensorflow/python/ops/random_ops.py | 10 ++ tensorflow/python/ops/rnn.py | 7 ++ tensorflow/python/ops/script_ops.py | 4 + tensorflow/python/ops/sets_impl.py | 5 + tensorflow/python/ops/signal/dct_ops.py | 3 + tensorflow/python/ops/signal/fft_ops.py | 21 +++- tensorflow/python/ops/signal/mel_ops.py | 2 + tensorflow/python/ops/signal/mfcc_ops.py | 2 + .../python/ops/signal/reconstruction_ops.py | 2 + tensorflow/python/ops/signal/shape_ops.py | 2 + tensorflow/python/ops/signal/spectral_ops.py | 6 + tensorflow/python/ops/signal/window_ops.py | 6 + tensorflow/python/ops/sort_ops.py | 3 + tensorflow/python/ops/sparse_ops.py | 6 + tensorflow/python/ops/special_math_ops.py | 10 ++ tensorflow/python/ops/stateless_random_ops.py | 11 ++ tensorflow/python/ops/string_ops.py | 7 ++ 52 files changed, 696 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 34d04d68c6c..0ee4a91f417 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -24,6 +24,7 @@ from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object from tensorflow.python.keras.utils.generic_utils import serialize_keras_object from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export # b/123041942 @@ -41,6 +42,7 @@ _TF_ACTIVATIONS_V2 = { @keras_export('keras.activations.softmax') +@dispatch.add_dispatch_support def softmax(x, axis=-1): """Softmax converts a real vector to a vector of categorical probabilities. @@ -82,6 +84,7 @@ def softmax(x, axis=-1): @keras_export('keras.activations.elu') +@dispatch.add_dispatch_support def elu(x, alpha=1.0): """Exponential linear unit. 
@@ -100,6 +103,7 @@ def elu(x, alpha=1.0): @keras_export('keras.activations.selu') +@dispatch.add_dispatch_support def selu(x): """Scaled Exponential Linear Unit (SELU). @@ -153,6 +157,7 @@ def selu(x): @keras_export('keras.activations.softplus') +@dispatch.add_dispatch_support def softplus(x): """Softplus activation function, `softplus(x) = log(exp(x) + 1)`. @@ -174,6 +179,7 @@ def softplus(x): @keras_export('keras.activations.softsign') +@dispatch.add_dispatch_support def softsign(x): """Softsign activation function, `softsign(x) = x / (abs(x) + 1)`. @@ -194,6 +200,7 @@ def softsign(x): @keras_export('keras.activations.swish') +@dispatch.add_dispatch_support def swish(x): """Swish activation function, `swish(x) = x * sigmoid(x)`. @@ -224,6 +231,7 @@ def swish(x): @keras_export('keras.activations.relu') +@dispatch.add_dispatch_support def relu(x, alpha=0., max_value=None, threshold=0): """Applies the rectified linear unit activation function. @@ -264,6 +272,7 @@ def relu(x, alpha=0., max_value=None, threshold=0): @keras_export('keras.activations.tanh') +@dispatch.add_dispatch_support def tanh(x): """Hyperbolic tangent activation function. @@ -285,6 +294,7 @@ def tanh(x): @keras_export('keras.activations.sigmoid') +@dispatch.add_dispatch_support def sigmoid(x): """Sigmoid activation function, `sigmoid(x) = 1 / (1 + exp(-x))`. @@ -314,6 +324,7 @@ def sigmoid(x): @keras_export('keras.activations.exponential') +@dispatch.add_dispatch_support def exponential(x): """Exponential activation function. @@ -334,6 +345,7 @@ def exponential(x): @keras_export('keras.activations.hard_sigmoid') +@dispatch.add_dispatch_support def hard_sigmoid(x): """Hard sigmoid activation function. @@ -360,6 +372,7 @@ def hard_sigmoid(x): @keras_export('keras.activations.linear') +@dispatch.add_dispatch_support def linear(x): """Linear activation function (pass-through). @@ -380,6 +393,7 @@ def linear(x): @keras_export('keras.activations.serialize') +@dispatch.add_dispatch_support def serialize(activation): """Returns the string identifier of an activation function. @@ -410,6 +424,7 @@ def serialize(activation): @keras_export('keras.activations.deserialize') +@dispatch.add_dispatch_support def deserialize(name, custom_objects=None): """Returns activation function given a string identifier. @@ -447,6 +462,7 @@ def deserialize(name, custom_objects=None): @keras_export('keras.activations.get') +@dispatch.add_dispatch_support def get(identifier): """Returns function. diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py index 11e53e032ae..11795625d06 100644 --- a/tensorflow/python/keras/backend.py +++ b/tensorflow/python/keras/backend.py @@ -76,6 +76,7 @@ from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import moving_averages from tensorflow.python.training.tracking import util as tracking_util +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util import object_identity from tensorflow.python.util import tf_contextlib @@ -173,6 +174,7 @@ def backend(): @keras_export('keras.backend.cast_to_floatx') +@dispatch.add_dispatch_support def cast_to_floatx(x): """Cast a Numpy array to the default Keras float type. @@ -799,6 +801,7 @@ def is_sparse(tensor): @keras_export('keras.backend.to_dense') +@dispatch.add_dispatch_support def to_dense(tensor): """Converts a sparse tensor into a dense tensor and returns it. 
@@ -1007,6 +1010,7 @@ def _initialize_variables(session): @keras_export('keras.backend.constant') +@dispatch.add_dispatch_support def constant(value, dtype=None, shape=None, name=None): """Creates a constant tensor. @@ -1163,6 +1167,7 @@ def is_placeholder(x): @keras_export('keras.backend.shape') +@dispatch.add_dispatch_support def shape(x): """Returns the symbolic shape of a tensor or variable. @@ -1245,6 +1250,7 @@ def ndim(x): @keras_export('keras.backend.dtype') +@dispatch.add_dispatch_support def dtype(x): """Returns the dtype of a Keras tensor or variable, as a string. @@ -1343,6 +1349,7 @@ def zeros(shape, dtype=None, name=None): @keras_export('keras.backend.ones') +@dispatch.add_dispatch_support def ones(shape, dtype=None, name=None): """Instantiates an all-ones variable and returns it. @@ -1377,6 +1384,7 @@ def ones(shape, dtype=None, name=None): @keras_export('keras.backend.eye') +@dispatch.add_dispatch_support def eye(size, dtype=None, name=None): """Instantiate an identity matrix and returns it. @@ -1433,6 +1441,7 @@ def zeros_like(x, dtype=None, name=None): @keras_export('keras.backend.ones_like') +@dispatch.add_dispatch_support def ones_like(x, dtype=None, name=None): """Instantiates an all-ones variable of the same shape as another tensor. @@ -1563,6 +1572,7 @@ def count_params(x): @keras_export('keras.backend.cast') +@dispatch.add_dispatch_support def cast(x, dtype): """Casts a tensor to a different dtype and returns it. @@ -1647,6 +1657,7 @@ def moving_average_update(x, value, momentum): @keras_export('keras.backend.dot') +@dispatch.add_dispatch_support def dot(x, y): """Multiplies 2 tensors (and/or variables) and returns a tensor. @@ -1707,6 +1718,7 @@ def dot(x, y): @keras_export('keras.backend.batch_dot') +@dispatch.add_dispatch_support def batch_dot(x, y, axes=None): """Batchwise dot product. @@ -1895,6 +1907,7 @@ def batch_dot(x, y, axes=None): @keras_export('keras.backend.transpose') +@dispatch.add_dispatch_support def transpose(x): """Transposes a tensor and returns it. @@ -1926,6 +1939,7 @@ def transpose(x): @keras_export('keras.backend.gather') +@dispatch.add_dispatch_support def gather(reference, indices): """Retrieves the elements of indices `indices` in the tensor `reference`. @@ -1961,6 +1975,7 @@ def gather(reference, indices): @keras_export('keras.backend.max') +@dispatch.add_dispatch_support def max(x, axis=None, keepdims=False): """Maximum value in a tensor. @@ -1979,6 +1994,7 @@ def max(x, axis=None, keepdims=False): @keras_export('keras.backend.min') +@dispatch.add_dispatch_support def min(x, axis=None, keepdims=False): """Minimum value in a tensor. @@ -1997,6 +2013,7 @@ def min(x, axis=None, keepdims=False): @keras_export('keras.backend.sum') +@dispatch.add_dispatch_support def sum(x, axis=None, keepdims=False): """Sum of the values in a tensor, alongside the specified axis. @@ -2015,6 +2032,7 @@ def sum(x, axis=None, keepdims=False): @keras_export('keras.backend.prod') +@dispatch.add_dispatch_support def prod(x, axis=None, keepdims=False): """Multiplies the values in a tensor, alongside the specified axis. @@ -2033,6 +2051,7 @@ def prod(x, axis=None, keepdims=False): @keras_export('keras.backend.cumsum') +@dispatch.add_dispatch_support def cumsum(x, axis=0): """Cumulative sum of the values in a tensor, alongside the specified axis. @@ -2047,6 +2066,7 @@ def cumsum(x, axis=0): @keras_export('keras.backend.cumprod') +@dispatch.add_dispatch_support def cumprod(x, axis=0): """Cumulative product of the values in a tensor, alongside the specified axis. 
@@ -2081,6 +2101,7 @@ def var(x, axis=None, keepdims=False): @keras_export('keras.backend.std') +@dispatch.add_dispatch_support def std(x, axis=None, keepdims=False): """Standard deviation of a tensor, alongside the specified axis. @@ -2107,6 +2128,7 @@ def std(x, axis=None, keepdims=False): @keras_export('keras.backend.mean') +@dispatch.add_dispatch_support def mean(x, axis=None, keepdims=False): """Mean of a tensor, alongside the specified axis. @@ -2127,6 +2149,7 @@ def mean(x, axis=None, keepdims=False): @keras_export('keras.backend.any') +@dispatch.add_dispatch_support def any(x, axis=None, keepdims=False): """Bitwise reduction (logical OR). @@ -2143,6 +2166,7 @@ def any(x, axis=None, keepdims=False): @keras_export('keras.backend.all') +@dispatch.add_dispatch_support def all(x, axis=None, keepdims=False): """Bitwise reduction (logical AND). @@ -2159,6 +2183,7 @@ def all(x, axis=None, keepdims=False): @keras_export('keras.backend.argmax') +@dispatch.add_dispatch_support def argmax(x, axis=-1): """Returns the index of the maximum value along an axis. @@ -2173,6 +2198,7 @@ def argmax(x, axis=-1): @keras_export('keras.backend.argmin') +@dispatch.add_dispatch_support def argmin(x, axis=-1): """Returns the index of the minimum value along an axis. @@ -2187,6 +2213,7 @@ def argmin(x, axis=-1): @keras_export('keras.backend.square') +@dispatch.add_dispatch_support def square(x): """Element-wise square. @@ -2200,6 +2227,7 @@ def square(x): @keras_export('keras.backend.abs') +@dispatch.add_dispatch_support def abs(x): """Element-wise absolute value. @@ -2213,6 +2241,7 @@ def abs(x): @keras_export('keras.backend.sqrt') +@dispatch.add_dispatch_support def sqrt(x): """Element-wise square root. @@ -2229,6 +2258,7 @@ def sqrt(x): @keras_export('keras.backend.exp') +@dispatch.add_dispatch_support def exp(x): """Element-wise exponential. @@ -2242,6 +2272,7 @@ def exp(x): @keras_export('keras.backend.log') +@dispatch.add_dispatch_support def log(x): """Element-wise log. @@ -2276,6 +2307,7 @@ def logsumexp(x, axis=None, keepdims=False): @keras_export('keras.backend.round') +@dispatch.add_dispatch_support def round(x): """Element-wise rounding to the closest integer. @@ -2291,6 +2323,7 @@ def round(x): @keras_export('keras.backend.sign') +@dispatch.add_dispatch_support def sign(x): """Element-wise sign. @@ -2304,6 +2337,7 @@ def sign(x): @keras_export('keras.backend.pow') +@dispatch.add_dispatch_support def pow(x, a): """Element-wise exponentiation. @@ -2318,6 +2352,7 @@ def pow(x, a): @keras_export('keras.backend.clip') +@dispatch.add_dispatch_support def clip(x, min_value, max_value): """Element-wise value clipping. @@ -2341,6 +2376,7 @@ def clip(x, min_value, max_value): @keras_export('keras.backend.equal') +@dispatch.add_dispatch_support def equal(x, y): """Element-wise equality between two tensors. @@ -2355,6 +2391,7 @@ def equal(x, y): @keras_export('keras.backend.not_equal') +@dispatch.add_dispatch_support def not_equal(x, y): """Element-wise inequality between two tensors. @@ -2369,6 +2406,7 @@ def not_equal(x, y): @keras_export('keras.backend.greater') +@dispatch.add_dispatch_support def greater(x, y): """Element-wise truth value of (x > y). @@ -2383,6 +2421,7 @@ def greater(x, y): @keras_export('keras.backend.greater_equal') +@dispatch.add_dispatch_support def greater_equal(x, y): """Element-wise truth value of (x >= y). @@ -2397,6 +2436,7 @@ def greater_equal(x, y): @keras_export('keras.backend.less') +@dispatch.add_dispatch_support def less(x, y): """Element-wise truth value of (x < y). 
@@ -2411,6 +2451,7 @@ def less(x, y): @keras_export('keras.backend.less_equal') +@dispatch.add_dispatch_support def less_equal(x, y): """Element-wise truth value of (x <= y). @@ -2425,6 +2466,7 @@ def less_equal(x, y): @keras_export('keras.backend.maximum') +@dispatch.add_dispatch_support def maximum(x, y): """Element-wise maximum of two tensors. @@ -2449,6 +2491,7 @@ def maximum(x, y): @keras_export('keras.backend.minimum') +@dispatch.add_dispatch_support def minimum(x, y): """Element-wise minimum of two tensors. @@ -2463,6 +2506,7 @@ def minimum(x, y): @keras_export('keras.backend.sin') +@dispatch.add_dispatch_support def sin(x): """Computes sin of x element-wise. @@ -2476,6 +2520,7 @@ def sin(x): @keras_export('keras.backend.cos') +@dispatch.add_dispatch_support def cos(x): """Computes cos of x element-wise. @@ -2621,6 +2666,7 @@ def normalize_batch_in_training(x, gamma, beta, reduction_axes, epsilon=1e-3): @keras_export('keras.backend.batch_normalization') +@dispatch.add_dispatch_support def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): """Applies batch normalization on x given mean, var, beta and gamma. @@ -2683,6 +2729,7 @@ def batch_normalization(x, mean, var, beta, gamma, axis=-1, epsilon=1e-3): @keras_export('keras.backend.concatenate') +@dispatch.add_dispatch_support def concatenate(tensors, axis=-1): """Concatenates a list of tensors alongside the specified axis. @@ -2720,6 +2767,7 @@ def concatenate(tensors, axis=-1): @keras_export('keras.backend.reshape') +@dispatch.add_dispatch_support def reshape(x, shape): """Reshapes a tensor to the specified shape. @@ -2749,6 +2797,7 @@ def reshape(x, shape): @keras_export('keras.backend.permute_dimensions') +@dispatch.add_dispatch_support def permute_dimensions(x, pattern): """Permutes axes in a tensor. @@ -2780,6 +2829,7 @@ def permute_dimensions(x, pattern): @keras_export('keras.backend.resize_images') +@dispatch.add_dispatch_support def resize_images(x, height_factor, width_factor, data_format, interpolation='nearest'): """Resizes the images contained in a 4D tensor. @@ -2843,6 +2893,7 @@ def resize_images(x, height_factor, width_factor, data_format, @keras_export('keras.backend.resize_volumes') +@dispatch.add_dispatch_support def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): """Resizes the volume contained in a 5D tensor. @@ -2875,6 +2926,7 @@ def resize_volumes(x, depth_factor, height_factor, width_factor, data_format): @keras_export('keras.backend.repeat_elements') +@dispatch.add_dispatch_support def repeat_elements(x, rep, axis): """Repeats the elements of a tensor along an axis, like `np.repeat`. @@ -2936,6 +2988,7 @@ def repeat_elements(x, rep, axis): @keras_export('keras.backend.repeat') +@dispatch.add_dispatch_support def repeat(x, n): """Repeats a 2D tensor. @@ -2971,6 +3024,7 @@ def repeat(x, n): @keras_export('keras.backend.arange') +@dispatch.add_dispatch_support def arange(start, stop=None, step=1, dtype='int32'): """Creates a 1D tensor containing a sequence of integers. @@ -3009,6 +3063,7 @@ def arange(start, stop=None, step=1, dtype='int32'): @keras_export('keras.backend.tile') +@dispatch.add_dispatch_support def tile(x, n): """Creates a tensor by tiling `x` by `n`. @@ -3026,6 +3081,7 @@ def tile(x, n): @keras_export('keras.backend.flatten') +@dispatch.add_dispatch_support def flatten(x): """Flatten a tensor. 
@@ -3051,6 +3107,7 @@ def flatten(x): @keras_export('keras.backend.batch_flatten') +@dispatch.add_dispatch_support def batch_flatten(x): """Turn a nD tensor into a 2D tensor with same 0th dimension. @@ -3076,6 +3133,7 @@ def batch_flatten(x): @keras_export('keras.backend.expand_dims') +@dispatch.add_dispatch_support def expand_dims(x, axis=-1): """Adds a 1-sized dimension at index "axis". @@ -3090,6 +3148,7 @@ def expand_dims(x, axis=-1): @keras_export('keras.backend.squeeze') +@dispatch.add_dispatch_support def squeeze(x, axis): """Removes a 1-dimension from the tensor at index "axis". @@ -3104,6 +3163,7 @@ def squeeze(x, axis): @keras_export('keras.backend.temporal_padding') +@dispatch.add_dispatch_support def temporal_padding(x, padding=(1, 1)): """Pads the middle dimension of a 3D tensor. @@ -3121,6 +3181,7 @@ def temporal_padding(x, padding=(1, 1)): @keras_export('keras.backend.spatial_2d_padding') +@dispatch.add_dispatch_support def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): """Pads the 2nd and 3rd dimensions of a 4D tensor. @@ -3152,6 +3213,7 @@ def spatial_2d_padding(x, padding=((1, 1), (1, 1)), data_format=None): @keras_export('keras.backend.spatial_3d_padding') +@dispatch.add_dispatch_support def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): """Pads 5D tensor with zeros along the depth, height, width dimensions. @@ -3196,6 +3258,7 @@ def spatial_3d_padding(x, padding=((1, 1), (1, 1), (1, 1)), data_format=None): @keras_export('keras.backend.stack') +@dispatch.add_dispatch_support def stack(x, axis=0): """Stacks a list of rank `R` tensors into a rank `R+1` tensor. @@ -3222,6 +3285,7 @@ def stack(x, axis=0): @keras_export('keras.backend.one_hot') +@dispatch.add_dispatch_support def one_hot(indices, num_classes): """Computes the one-hot representation of an integer tensor. @@ -3241,6 +3305,7 @@ def one_hot(indices, num_classes): @keras_export('keras.backend.reverse') +@dispatch.add_dispatch_support def reverse(x, axes): """Reverse a tensor along the specified axes. @@ -3321,6 +3386,7 @@ def get_value(x): @keras_export('keras.backend.batch_get_value') +@dispatch.add_dispatch_support def batch_get_value(tensors): """Returns the value of more than one tensor variable. @@ -3382,6 +3448,7 @@ def set_value(x, value): @keras_export('keras.backend.batch_set_value') +@dispatch.add_dispatch_support def batch_set_value(tuples): """Sets the values of many tensor variables at once. @@ -3424,6 +3491,7 @@ set_value.__doc__ = set_value.__doc__.format(snippet=_VALUE_SET_CODE_STRING) @keras_export('keras.backend.print_tensor') +@dispatch.add_dispatch_support def print_tensor(x, message=''): """Prints `message` and the tensor value when evaluated. @@ -3861,6 +3929,7 @@ def gradients(loss, variables): @keras_export('keras.backend.stop_gradient') +@dispatch.add_dispatch_support def stop_gradient(variables): """Returns `variables` but with zero gradient w.r.t. every other variable. @@ -3882,6 +3951,7 @@ def stop_gradient(variables): @keras_export('keras.backend.rnn') +@dispatch.add_dispatch_support def rnn(step_function, inputs, initial_states, @@ -4276,6 +4346,7 @@ def rnn(step_function, @keras_export('keras.backend.switch') +@dispatch.add_dispatch_support def switch(condition, then_expression, else_expression): """Switches between two operations depending on a scalar value. 
@@ -4409,6 +4480,7 @@ def in_test_phase(x, alt, training=None): @keras_export('keras.backend.relu') +@dispatch.add_dispatch_support def relu(x, alpha=0., max_value=None, threshold=0): """Rectified linear unit. @@ -4462,6 +4534,7 @@ def relu(x, alpha=0., max_value=None, threshold=0): @keras_export('keras.backend.elu') +@dispatch.add_dispatch_support def elu(x, alpha=1.): """Exponential linear unit. @@ -4480,6 +4553,7 @@ def elu(x, alpha=1.): @keras_export('keras.backend.softmax') +@dispatch.add_dispatch_support def softmax(x, axis=-1): """Softmax of a tensor. @@ -4495,6 +4569,7 @@ def softmax(x, axis=-1): @keras_export('keras.backend.softplus') +@dispatch.add_dispatch_support def softplus(x): """Softplus of a tensor. @@ -4508,6 +4583,7 @@ def softplus(x): @keras_export('keras.backend.softsign') +@dispatch.add_dispatch_support def softsign(x): """Softsign of a tensor. @@ -4527,6 +4603,7 @@ def _backtrack_identity(tensor): @keras_export('keras.backend.categorical_crossentropy') +@dispatch.add_dispatch_support def categorical_crossentropy(target, output, from_logits=False, axis=-1): """Categorical crossentropy between an output tensor and a target tensor. @@ -4595,6 +4672,7 @@ def categorical_crossentropy(target, output, from_logits=False, axis=-1): @keras_export('keras.backend.sparse_categorical_crossentropy') +@dispatch.add_dispatch_support def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1): """Categorical crossentropy with integer targets. @@ -4676,6 +4754,7 @@ def sparse_categorical_crossentropy(target, output, from_logits=False, axis=-1): @keras_export('keras.backend.binary_crossentropy') +@dispatch.add_dispatch_support def binary_crossentropy(target, output, from_logits=False): """Binary crossentropy between an output tensor and a target tensor. @@ -4712,6 +4791,7 @@ def binary_crossentropy(target, output, from_logits=False): @keras_export('keras.backend.sigmoid') +@dispatch.add_dispatch_support def sigmoid(x): """Element-wise sigmoid. @@ -4725,6 +4805,7 @@ def sigmoid(x): @keras_export('keras.backend.hard_sigmoid') +@dispatch.add_dispatch_support def hard_sigmoid(x): """Segment-wise linear approximation of sigmoid. @@ -4747,6 +4828,7 @@ def hard_sigmoid(x): @keras_export('keras.backend.tanh') +@dispatch.add_dispatch_support def tanh(x): """Element-wise tanh. @@ -4760,6 +4842,7 @@ def tanh(x): @keras_export('keras.backend.dropout') +@dispatch.add_dispatch_support def dropout(x, level, noise_shape=None, seed=None): """Sets entries in `x` to zero at random, while scaling the entire tensor. @@ -4780,6 +4863,7 @@ def dropout(x, level, noise_shape=None, seed=None): @keras_export('keras.backend.l2_normalize') +@dispatch.add_dispatch_support def l2_normalize(x, axis=None): """Normalizes a tensor wrt the L2 norm alongside the specified axis. @@ -4794,6 +4878,7 @@ def l2_normalize(x, axis=None): @keras_export('keras.backend.in_top_k') +@dispatch.add_dispatch_support def in_top_k(predictions, targets, k): """Returns whether the `targets` are in the top `k` `predictions`. 
@@ -4896,6 +4981,7 @@ def _preprocess_padding(padding): @keras_export('keras.backend.conv1d') +@dispatch.add_dispatch_support def conv1d(x, kernel, strides=1, @@ -4946,6 +5032,7 @@ def conv1d(x, @keras_export('keras.backend.conv2d') +@dispatch.add_dispatch_support def conv2d(x, kernel, strides=(1, 1), @@ -4989,6 +5076,7 @@ def conv2d(x, @keras_export('keras.backend.conv2d_transpose') +@dispatch.add_dispatch_support def conv2d_transpose(x, kernel, output_shape, @@ -5129,6 +5217,7 @@ def separable_conv1d(x, @keras_export('keras.backend.separable_conv2d') +@dispatch.add_dispatch_support def separable_conv2d(x, depthwise_kernel, pointwise_kernel, @@ -5186,6 +5275,7 @@ def separable_conv2d(x, @keras_export('keras.backend.depthwise_conv2d') +@dispatch.add_dispatch_support def depthwise_conv2d(x, depthwise_kernel, strides=(1, 1), @@ -5235,6 +5325,7 @@ def depthwise_conv2d(x, @keras_export('keras.backend.conv3d') +@dispatch.add_dispatch_support def conv3d(x, kernel, strides=(1, 1, 1), @@ -5337,6 +5428,7 @@ def conv3d_transpose(x, @keras_export('keras.backend.pool2d') +@dispatch.add_dispatch_support def pool2d(x, pool_size, strides=(1, 1), @@ -5396,6 +5488,7 @@ def pool2d(x, @keras_export('keras.backend.pool3d') +@dispatch.add_dispatch_support def pool3d(x, pool_size, strides=(1, 1, 1), @@ -5526,6 +5619,7 @@ def local_conv(inputs, @keras_export('keras.backend.local_conv1d') +@dispatch.add_dispatch_support def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): """Apply 1D conv with un-shared weights. @@ -5561,6 +5655,7 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None): @keras_export('keras.backend.local_conv2d') +@dispatch.add_dispatch_support def local_conv2d(inputs, kernel, kernel_size, @@ -5602,6 +5697,7 @@ def local_conv2d(inputs, @keras_export('keras.backend.bias_add') +@dispatch.add_dispatch_support def bias_add(x, bias, data_format=None): """Adds a bias vector to a tensor. @@ -5646,6 +5742,7 @@ def bias_add(x, bias, data_format=None): @keras_export('keras.backend.random_normal') +@dispatch.add_dispatch_support def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): """Returns a tensor with normal distribution of values. @@ -5682,6 +5779,7 @@ def random_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): @keras_export('keras.backend.random_uniform') +@dispatch.add_dispatch_support def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None): """Returns a tensor with uniform distribution of values. @@ -5715,6 +5813,7 @@ def random_uniform(shape, minval=0.0, maxval=1.0, dtype=None, seed=None): @deprecated(None, 'Use `tf.keras.backend.random_bernoulli` instead.') @keras_export('keras.backend.random_binomial') +@dispatch.add_dispatch_support def random_binomial(shape, p=0.0, dtype=None, seed=None): """Returns a tensor with random binomial distribution of values. @@ -5751,6 +5850,7 @@ def random_binomial(shape, p=0.0, dtype=None, seed=None): @keras_export('keras.backend.random_bernoulli') +@dispatch.add_dispatch_support def random_bernoulli(shape, p=0.0, dtype=None, seed=None): """Returns a tensor with random bernoulli distribution of values. @@ -5767,6 +5867,7 @@ def random_bernoulli(shape, p=0.0, dtype=None, seed=None): @keras_export('keras.backend.truncated_normal') +@dispatch.add_dispatch_support def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): """Returns a tensor with truncated random normal distribution of values. 
@@ -5801,6 +5902,7 @@ def truncated_normal(shape, mean=0.0, stddev=1.0, dtype=None, seed=None): @keras_export('keras.backend.ctc_label_dense_to_sparse') +@dispatch.add_dispatch_support def ctc_label_dense_to_sparse(labels, label_lengths): """Converts CTC labels from dense to sparse. @@ -5847,6 +5949,7 @@ def ctc_label_dense_to_sparse(labels, label_lengths): @keras_export('keras.backend.ctc_batch_cost') +@dispatch.add_dispatch_support def ctc_batch_cost(y_true, y_pred, input_length, label_length): """Runs CTC loss algorithm on each batch element. @@ -5879,6 +5982,7 @@ def ctc_batch_cost(y_true, y_pred, input_length, label_length): @keras_export('keras.backend.ctc_decode') +@dispatch.add_dispatch_support def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): """Decodes the output of a softmax. diff --git a/tensorflow/python/keras/backend_config.py b/tensorflow/python/keras/backend_config.py index c1bf163c444..cd1f1e4b423 100644 --- a/tensorflow/python/keras/backend_config.py +++ b/tensorflow/python/keras/backend_config.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export # The type of float to use throughout a session. @@ -30,6 +31,7 @@ _IMAGE_DATA_FORMAT = 'channels_last' @keras_export('keras.backend.epsilon') +@dispatch.add_dispatch_support def epsilon(): """Returns the value of the fuzz factor used in numeric expressions. @@ -110,6 +112,7 @@ def set_floatx(value): @keras_export('keras.backend.image_data_format') +@dispatch.add_dispatch_support def image_data_format(): """Returns the default image data format convention. diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index 99fb015288b..2bb53dcfaa5 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.ops.losses import losses_impl from tensorflow.python.ops.losses import util as tf_losses_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export from tensorflow.tools.docs import doc_controls @@ -1164,6 +1165,7 @@ class Huber(LossFunctionWrapper): 'keras.losses.mean_squared_error', 'keras.losses.mse', 'keras.losses.MSE') +@dispatch.add_dispatch_support def mean_squared_error(y_true, y_pred): """Computes the mean squared error between labels and predictions. @@ -1199,6 +1201,7 @@ def mean_squared_error(y_true, y_pred): 'keras.losses.mean_absolute_error', 'keras.losses.mae', 'keras.losses.MAE') +@dispatch.add_dispatch_support def mean_absolute_error(y_true, y_pred): """Computes the mean absolute error between labels and predictions. @@ -1231,6 +1234,7 @@ def mean_absolute_error(y_true, y_pred): 'keras.losses.mean_absolute_percentage_error', 'keras.losses.mape', 'keras.losses.MAPE') +@dispatch.add_dispatch_support def mean_absolute_percentage_error(y_true, y_pred): """Computes the mean absolute percentage error between `y_true` and `y_pred`. @@ -1267,6 +1271,7 @@ def mean_absolute_percentage_error(y_true, y_pred): 'keras.losses.mean_squared_logarithmic_error', 'keras.losses.msle', 'keras.losses.MSLE') +@dispatch.add_dispatch_support def mean_squared_logarithmic_error(y_true, y_pred): """Computes the mean squared logarithmic error between `y_true` and `y_pred`. 
@@ -1315,6 +1320,7 @@ def _maybe_convert_labels(y_true): @keras_export('keras.metrics.squared_hinge', 'keras.losses.squared_hinge') +@dispatch.add_dispatch_support def squared_hinge(y_true, y_pred): """Computes the squared hinge loss between `y_true` and `y_pred`. @@ -1347,6 +1353,7 @@ def squared_hinge(y_true, y_pred): @keras_export('keras.metrics.hinge', 'keras.losses.hinge') +@dispatch.add_dispatch_support def hinge(y_true, y_pred): """Computes the hinge loss between `y_true` and `y_pred`. @@ -1378,6 +1385,7 @@ def hinge(y_true, y_pred): @keras_export('keras.losses.categorical_hinge') +@dispatch.add_dispatch_support def categorical_hinge(y_true, y_pred): """Computes the categorical hinge loss between `y_true` and `y_pred`. @@ -1410,6 +1418,7 @@ def categorical_hinge(y_true, y_pred): @keras_export('keras.losses.huber', v1=[]) +@dispatch.add_dispatch_support def huber(y_true, y_pred, delta=1.0): """Computes Huber loss value. @@ -1447,6 +1456,7 @@ def huber(y_true, y_pred, delta=1.0): @keras_export('keras.losses.log_cosh', 'keras.losses.logcosh') +@dispatch.add_dispatch_support def log_cosh(y_true, y_pred): """Logarithm of the hyperbolic cosine of the prediction error. @@ -1485,6 +1495,7 @@ def log_cosh(y_true, y_pred): @keras_export('keras.metrics.categorical_crossentropy', 'keras.losses.categorical_crossentropy') +@dispatch.add_dispatch_support def categorical_crossentropy(y_true, y_pred, from_logits=False, @@ -1525,6 +1536,7 @@ def categorical_crossentropy(y_true, @keras_export('keras.metrics.sparse_categorical_crossentropy', 'keras.losses.sparse_categorical_crossentropy') +@dispatch.add_dispatch_support def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): """Computes the sparse categorical crossentropy loss. @@ -1556,6 +1568,7 @@ def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): @keras_export('keras.metrics.binary_crossentropy', 'keras.losses.binary_crossentropy') +@dispatch.add_dispatch_support def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): """Computes the binary crossentropy loss. @@ -1599,6 +1612,7 @@ def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): 'keras.losses.kullback_leibler_divergence', 'keras.losses.kld', 'keras.losses.KLD') +@dispatch.add_dispatch_support def kl_divergence(y_true, y_pred): """Computes Kullback-Leibler divergence loss between `y_true` and `y_pred`. @@ -1635,6 +1649,7 @@ def kl_divergence(y_true, y_pred): @keras_export('keras.metrics.poisson', 'keras.losses.poisson') +@dispatch.add_dispatch_support def poisson(y_true, y_pred): """Computes the Poisson loss between y_true and y_pred. @@ -1676,6 +1691,7 @@ def poisson(y_true, y_pred): 'keras.losses.cosine', 'keras.losses.cosine_similarity', ]) +@dispatch.add_dispatch_support def cosine_similarity(y_true, y_pred, axis=-1): """Computes the cosine similarity between labels and predictions. 
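The hunks above only attach dispatch support to the Keras loss endpoints; they change no numerics. What the decorator buys is that a registered dispatcher may intercept calls whose argument types the stock implementation rejects. Below is a minimal illustrative sketch of such a hook, assuming the OpDispatcher interface in tensorflow/python/util/dispatch.py; the MaskedTensor type and the dispatcher are hypothetical and not part of this patch.

# Illustrative only: a dispatcher handling keras.losses.mean_squared_error
# for a hypothetical MaskedTensor wrapper type.
from tensorflow.python.keras import losses
from tensorflow.python.util import dispatch


class MaskedTensor(object):
  """Hypothetical tensor-like value carrying a validity mask."""

  def __init__(self, values, mask):
    self.values = values
    self.mask = mask


class MaskedMSEDispatcher(dispatch.OpDispatcher):
  """Unwraps MaskedTensor inputs and delegates to the stock loss."""

  def handle(self, args, kwargs):  # kwargs ignored for brevity
    if any(isinstance(a, MaskedTensor) for a in args):
      unwrap = lambda t: t.values if isinstance(t, MaskedTensor) else t
      return losses.mean_squared_error(*[unwrap(a) for a in args])
    return self.NOT_SUPPORTED  # let other dispatchers, or the original error, run


# register() attaches the handler to the endpoint decorated above.
MaskedMSEDispatcher().register(losses.mean_squared_error)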
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index 63cf7c578bc..a67755b9333 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -69,6 +69,7 @@ from tensorflow.python.ops import variables as tf_variables from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util as tf_losses_utils from tensorflow.python.training.tracking import base as trackable +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import keras_export @@ -3212,6 +3213,7 @@ def accuracy(y_true, y_pred): @keras_export('keras.metrics.binary_accuracy') +@dispatch.add_dispatch_support def binary_accuracy(y_true, y_pred, threshold=0.5): """Calculates how often predictions matches binary labels. @@ -3239,6 +3241,7 @@ def binary_accuracy(y_true, y_pred, threshold=0.5): @keras_export('keras.metrics.categorical_accuracy') +@dispatch.add_dispatch_support def categorical_accuracy(y_true, y_pred): """Calculates how often predictions matches one-hot labels. @@ -3267,6 +3270,7 @@ def categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.sparse_categorical_accuracy') +@dispatch.add_dispatch_support def sparse_categorical_accuracy(y_true, y_pred): """Calculates how often predictions matches integer labels. @@ -3307,6 +3311,7 @@ def sparse_categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.top_k_categorical_accuracy') +@dispatch.add_dispatch_support def top_k_categorical_accuracy(y_true, y_pred, k=5): """Computes how often targets are in the top `K` predictions. @@ -3332,6 +3337,7 @@ def top_k_categorical_accuracy(y_true, y_pred, k=5): @keras_export('keras.metrics.sparse_top_k_categorical_accuracy') +@dispatch.add_dispatch_support def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): """Computes how often integer targets are in the top `K` predictions. diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 9a5e95d8aad..a2640925a38 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -57,6 +57,7 @@ _BaseSlice = slice @tf_export("reshape", v1=["reshape", "manip.reshape"]) +@dispatch.add_dispatch_support def reshape(tensor, shape, name=None): # pylint: disable=redefined-outer-name r"""Reshapes a tensor. @@ -197,6 +198,7 @@ def reshape(tensor, shape, name=None): # pylint: disable=redefined-outer-name @tf_export("fill") +@dispatch.add_dispatch_support def fill(dims, value, name=None): r"""Creates a tensor filled with a scalar value. @@ -455,6 +457,7 @@ listdiff.__doc__ = gen_array_ops.list_diff.__doc__ + "\n" + listdiff.__doc__ "This op will be removed after the deprecation date. " "Please switch to tf.sets.difference().") @tf_export(v1=["setdiff1d"]) +@dispatch.add_dispatch_support def setdiff1d(x, y, index_dtype=dtypes.int32, name=None): """Computes the difference between two lists of numbers or strings. @@ -498,6 +501,7 @@ setdiff1d.__doc__ = gen_array_ops.list_diff.__doc__ @tf_export("broadcast_dynamic_shape") +@dispatch.add_dispatch_support def broadcast_dynamic_shape(shape_x, shape_y): """Computes the shape of a broadcast given symbolic shapes. @@ -523,6 +527,7 @@ def broadcast_dynamic_shape(shape_x, shape_y): @tf_export("broadcast_static_shape") +@dispatch.add_dispatch_support def broadcast_static_shape(shape_x, shape_y): """Computes the shape of a broadcast given known shapes. 
@@ -550,6 +555,7 @@ def broadcast_static_shape(shape_x, shape_y): @tf_export("shape", v1=[]) +@dispatch.add_dispatch_support def shape_v2(input, out_type=dtypes.int32, name=None): # pylint: disable=redefined-builtin """Returns the shape of a tensor. @@ -596,6 +602,7 @@ def shape_v2(input, out_type=dtypes.int32, name=None): @tf_export(v1=["shape"]) +@dispatch.add_dispatch_support def shape(input, name=None, out_type=dtypes.int32): # pylint: disable=redefined-builtin """Returns the shape of a tensor. @@ -650,6 +657,7 @@ def shape_internal(input, name=None, optimize=True, out_type=dtypes.int32): @tf_export("shape_n") +@dispatch.add_dispatch_support def shape_n(input, out_type=dtypes.int32, name=None): # pylint: disable=redefined-builtin """Returns shape of tensors. @@ -1007,6 +1015,7 @@ def _slice_helper(tensor, slice_spec, var=None): # pylint: disable=undefined-variable,protected-access,redefined-outer-name @tf_export("slice") +@dispatch.add_dispatch_support def slice(input_, begin, size, name=None): # pylint: disable=redefined-builtin """Extracts a slice from a tensor. @@ -1062,6 +1071,7 @@ def slice(input_, begin, size, name=None): # pylint: disable=invalid-name @tf_export("strided_slice") +@dispatch.add_dispatch_support def strided_slice(input_, begin, end, @@ -1253,6 +1263,7 @@ ops.Tensor._override_operator("__getitem__", _slice_helper) @tf_export("parallel_stack") +@dispatch.add_dispatch_support def parallel_stack(values, name="parallel_stack"): """Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor in parallel. @@ -1489,6 +1500,7 @@ ops.register_tensor_conversion_function((list, tuple), @tf_export("unstack") +@dispatch.add_dispatch_support def unstack(value, num=None, axis=0, name="unstack"): """Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors. @@ -1632,6 +1644,7 @@ def concat(values, axis, name="concat"): @tf_export(v1=["boolean_mask"]) +@dispatch.add_dispatch_support def boolean_mask(tensor, mask, name="boolean_mask", axis=None): """Apply boolean mask to tensor. @@ -1824,6 +1837,7 @@ def sparse_mask(a, mask_indices, name=None): @tf_export("unique") +@dispatch.add_dispatch_support def unique(x, out_idx=dtypes.int32, name=None): """Finds unique elements in a 1-D tensor. @@ -1871,6 +1885,7 @@ unique.__doc__ = gen_array_ops.unique.__doc__ @tf_export("unique_with_counts") +@dispatch.add_dispatch_support def unique_with_counts(x, out_idx=dtypes.int32, name=None): """Finds unique elements in a 1-D tensor. @@ -1923,6 +1938,7 @@ unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__ @tf_export("split") +@dispatch.add_dispatch_support def split(value, num_or_size_splits, axis=0, num=None, name="split"): """Splits a tensor `value` into a list of sub tensors. @@ -2000,6 +2016,7 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): @tf_export("transpose", v1=[]) +@dispatch.add_dispatch_support def transpose_v2(a, perm=None, conjugate=False, name="transpose"): """Transposes `a`, where `a` is a Tensor. @@ -2080,6 +2097,7 @@ def transpose_v2(a, perm=None, conjugate=False, name="transpose"): @tf_export(v1=["transpose"]) +@dispatch.add_dispatch_support def transpose(a, perm=None, name="transpose", conjugate=False): """Transposes `a`. 
@@ -2170,6 +2188,7 @@ def transpose(a, perm=None, name="transpose", conjugate=False): @tf_export( "linalg.matrix_transpose", v1=["linalg.transpose", "linalg.matrix_transpose", "matrix_transpose"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_transpose", "linalg.transpose") def matrix_transpose(a, name="matrix_transpose", conjugate=False): """Transposes last two dimensions of tensor `a`. @@ -2248,6 +2267,7 @@ def matrix_transpose(a, name="matrix_transpose", conjugate=False): @tf_export("linalg.diag", v1=["linalg.diag", "matrix_diag"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_diag") def matrix_diag(diagonal, name="diag", @@ -2416,6 +2436,7 @@ def matrix_diag(diagonal, @tf_export("linalg.diag_part", v1=["linalg.diag_part", "matrix_diag_part"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_diag_part") @dispatch.add_dispatch_support def matrix_diag_part( @@ -2556,6 +2577,7 @@ def matrix_diag_part( @tf_export("linalg.set_diag", v1=["linalg.set_diag", "matrix_set_diag"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("matrix_set_diag") def matrix_set_diag( input, # pylint:disable=redefined-builtin @@ -2719,6 +2741,7 @@ def _tag_zeros_tensor(fun): @tf_export("zeros") +@dispatch.add_dispatch_support @_tag_zeros_tensor def zeros(shape, dtype=dtypes.float32, name=None): """Creates a tensor with all elements set to zero. @@ -2971,6 +2994,7 @@ def ones_like_impl(tensor, dtype, name, optimize=True): @tf_export("ones") +@dispatch.add_dispatch_support def ones(shape, dtype=dtypes.float32, name=None): """Creates a tensor with all elements set to one (1). @@ -3182,6 +3206,7 @@ def sparse_placeholder(dtype, shape=None, name=None): @tf_export("pad", v1=[]) +@dispatch.add_dispatch_support def pad_v2(tensor, paddings, mode="CONSTANT", constant_values=0, name=None): """Pads a tensor. @@ -3240,6 +3265,7 @@ def pad_v2(tensor, paddings, mode="CONSTANT", constant_values=0, name=None): @tf_export(v1=["pad"]) +@dispatch.add_dispatch_support def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0): # pylint: disable=invalid-name """Pads a tensor. @@ -3357,6 +3383,7 @@ def _get_paddings_constant(paddings): @tf_export("meshgrid") +@dispatch.add_dispatch_support def meshgrid(*args, **kwargs): """Broadcasts parameters for evaluation on an N-D grid. @@ -3500,6 +3527,7 @@ def _TileGradShape(op): @tf_export("edit_distance") +@dispatch.add_dispatch_support def edit_distance(hypothesis, truth, normalize=True, name="edit_distance"): """Computes the Levenshtein distance between sequences. 
@@ -3694,6 +3722,7 @@ def required_space_to_batch_paddings(input_shape, @tf_export(v1=["nn.space_to_batch", "space_to_batch"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("space_to_batch") def space_to_batch( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin @@ -3717,6 +3746,7 @@ space_to_batch.__doc__ = gen_array_ops.space_to_batch.__doc__ @tf_export("space_to_batch", "nn.space_to_batch", v1=[]) +@dispatch.add_dispatch_support def space_to_batch_v2(input, block_shape, paddings, name=None): # pylint: disable=redefined-builtin return space_to_batch_nd(input, block_shape, paddings, name) @@ -3725,6 +3755,7 @@ space_to_batch_v2.__doc__ = gen_array_ops.space_to_batch_nd.__doc__ @tf_export(v1=["nn.space_to_depth", "space_to_depth"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("space_to_depth") def space_to_depth(input, block_size, name=None, data_format="NHWC"): # pylint: disable=redefined-builtin return gen_array_ops.space_to_depth(input, block_size, data_format, name=name) @@ -3734,6 +3765,7 @@ space_to_depth.__doc__ = gen_array_ops.space_to_depth.__doc__ @tf_export("nn.space_to_depth", v1=[]) +@dispatch.add_dispatch_support def space_to_depth_v2(input, block_size, data_format="NHWC", name=None): # pylint: disable=redefined-builtin return gen_array_ops.space_to_depth(input, block_size, data_format, name=name) @@ -3742,6 +3774,7 @@ space_to_depth_v2.__doc__ = gen_array_ops.space_to_depth.__doc__ @tf_export(v1=["nn.depth_to_space", "depth_to_space"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("depth_to_space") def depth_to_space(input, block_size, name=None, data_format="NHWC"): # pylint: disable=redefined-builtin return gen_array_ops.depth_to_space(input, block_size, data_format, name=name) @@ -3751,6 +3784,7 @@ depth_to_space.__doc__ = gen_array_ops.depth_to_space.__doc__ @tf_export("nn.depth_to_space", v1=[]) +@dispatch.add_dispatch_support def depth_to_space_v2(input, block_size, data_format="NHWC", name=None): # pylint: disable=redefined-builtin return gen_array_ops.depth_to_space(input, block_size, data_format, name=name) @@ -3759,6 +3793,7 @@ depth_to_space_v2.__doc__ = gen_array_ops.depth_to_space.__doc__ @tf_export(v1=["batch_to_space"]) +@dispatch.add_dispatch_support def batch_to_space(input, crops, block_size, name=None, block_shape=None): # pylint: disable=redefined-builtin,missing-docstring block_size = deprecation.deprecated_argument_lookup("block_shape", block_shape, "block_size", @@ -3776,6 +3811,7 @@ batch_to_space.__doc__ = gen_array_ops.batch_to_space.__doc__ @tf_export("batch_to_space", v1=[]) +@dispatch.add_dispatch_support def batch_to_space_v2(input, block_shape, crops, name=None): # pylint: disable=redefined-builtin """BatchToSpace for N-D tensors of type T. @@ -4091,6 +4127,7 @@ def _all_dimensions(x): @tf_export("sequence_mask") +@dispatch.add_dispatch_support def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None): """Returns a mask tensor representing the first N positions of each cell. @@ -4317,6 +4354,7 @@ def where(condition, x=None, y=None, name=None): @tf_export("where", v1=["where_v2"]) +@dispatch.add_dispatch_support def where_v2(condition, x=None, y=None, name=None): """Return the elements where `condition` is `True` (multiplexing `x` and `y`). @@ -5003,6 +5041,7 @@ def batch_gather_nd(params, indices, batch_dims, name=None): # because round_mode was added later. # (And also now because of 'axis' processing). 
@tf_export(v1=["quantize_v2"]) +@dispatch.add_dispatch_support @deprecation.deprecated( "2017-10-25", "`tf.quantize_v2` is deprecated, please use `tf.quantization.quantize` " @@ -5056,6 +5095,7 @@ quantize_v2.__doc__ = """Please use `tf.quantization.quantize` instead.""" # tf.quantization.quantize; we can deprecate tf.quantization.quantize in next # version of TensorFlow. @tf_export("quantization.quantize", v1=["quantization.quantize", "quantize"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("quantize") def quantize( input, # pylint: disable=redefined-builtin @@ -5095,6 +5135,7 @@ def quantize( @tf_export("quantization.dequantize", v1=["quantization.dequantize", "dequantize"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("dequantize") def dequantize( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin @@ -5130,6 +5171,7 @@ dequantize.__doc__ = gen_array_ops.dequantize.__doc__ @tf_export("quantization.quantize_and_dequantize") +@dispatch.add_dispatch_support def quantize_and_dequantize( input, # pylint: disable=redefined-builtin input_min, @@ -5189,6 +5231,7 @@ def quantize_and_dequantize( @tf_export("searchsorted") +@dispatch.add_dispatch_support def searchsorted(sorted_sequence, values, side="left", @@ -5253,6 +5296,7 @@ quantize.__doc__ = gen_array_ops.quantize_v2.__doc__ @tf_export("image.extract_patches") +@dispatch.add_dispatch_support def extract_image_patches_v2(images, sizes, strides, rates, padding, name=None): r"""Extract `patches` from `images`. @@ -5374,6 +5418,7 @@ def extract_image_patches_v2(images, sizes, strides, rates, padding, name=None): @tf_export(v1=["image.extract_image_patches", "extract_image_patches"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "ksizes is deprecated, use sizes instead", "ksizes") def extract_image_patches( # pylint: disable=missing-docstring @@ -5422,6 +5467,7 @@ extract_image_patches.__doc__ = gen_array_ops.extract_image_patches.__doc__ @tf_export("fingerprint") +@dispatch.add_dispatch_support def fingerprint(data, method="farmhash64", name=None): r"""Generates fingerprint values. @@ -5668,6 +5714,7 @@ def _with_nonzero_rank(data): @tf_export("repeat") +@dispatch.add_dispatch_support def repeat(input, repeats, axis=None, name=None): # pylint: disable=redefined-builtin """Repeat elements of `input`. 
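Every file touched so far follows the same placement: add_dispatch_support sits directly beneath tf_export and above any deprecation decorators. Because Python applies decorators bottom-up, the dispatch wrapper ends up outermost, so a TypeError raised anywhere inside, including by a deprecation wrapper, can still reach a dispatcher. A hedged sketch of that ordering for a made-up op follows; the name example.my_scale and its body are assumptions, not part of TensorFlow.

from tensorflow.python.ops import math_ops
from tensorflow.python.util import deprecation
from tensorflow.python.util import dispatch
from tensorflow.python.util.tf_export import tf_export


@tf_export("example.my_scale", v1=["example.my_scale", "my_scale"])
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints("my_scale")
def my_scale(x, factor=2.0, name=None):
  """Hypothetical op used only to show the decorator ordering."""
  return math_ops.multiply(x, factor, name=name)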
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py index 56f76a49d51..6c1a36e65c9 100644 --- a/tensorflow/python/ops/candidate_sampling_ops.py +++ b/tensorflow/python/ops/candidate_sampling_ops.py @@ -24,12 +24,14 @@ from tensorflow.python.ops import array_ops # pylint: disable=unused-import from tensorflow.python.ops import gen_candidate_sampling_ops from tensorflow.python.ops import math_ops # pylint: disable=unused-import from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export( 'random.uniform_candidate_sampler', v1=['random.uniform_candidate_sampler', 'nn.uniform_candidate_sampler']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('nn.uniform_candidate_sampler') def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, range_max, seed=None, name=None): @@ -92,6 +94,7 @@ def uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, 'random.log_uniform_candidate_sampler', 'nn.log_uniform_candidate_sampler' ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('nn.log_uniform_candidate_sampler') def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, range_max, seed=None, name=None): @@ -154,6 +157,7 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique, @tf_export( 'random.learned_unigram_candidate_sampler', 'nn.learned_unigram_candidate_sampler') +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints(['nn.learned_unigram_candidate_sampler']) def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled, unique, range_max, seed=None, name=None): @@ -213,6 +217,7 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled, @tf_export('random.fixed_unigram_candidate_sampler', 'nn.fixed_unigram_candidate_sampler') +@dispatch.add_dispatch_support def fixed_unigram_candidate_sampler(true_classes, num_true, num_sampled, @@ -341,6 +346,7 @@ def all_candidate_sampler(true_classes, num_true, num_sampled, unique, @tf_export('nn.compute_accidental_hits') +@dispatch.add_dispatch_support def compute_accidental_hits(true_classes, sampled_candidates, num_true, seed=None, name=None): """Compute the position ids in `sampled_candidates` matching `true_classes`. diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py index cefca5defae..9a5b86a1deb 100644 --- a/tensorflow/python/ops/check_ops.py +++ b/tensorflow/python/ops/check_ops.py @@ -35,6 +35,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export NUMERIC_TYPES = frozenset( @@ -375,6 +376,7 @@ def _binary_assert(sym, opname, op_func, static_func, x, y, data, summarize, @tf_export( 'debugging.assert_proper_iterable', v1=['debugging.assert_proper_iterable', 'assert_proper_iterable']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_proper_iterable') def assert_proper_iterable(values): """Static assert that values is a "proper" iterable. 
@@ -404,6 +406,7 @@ def assert_proper_iterable(values): @tf_export('debugging.assert_negative', v1=[]) +@dispatch.add_dispatch_support def assert_negative_v2(x, message=None, summarize=None, name=None): """Assert the condition `x < 0` holds element-wise. @@ -436,6 +439,7 @@ def assert_negative_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_negative', 'assert_negative']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_negative') @_unary_assert_doc('< 0', 'negative') def assert_negative(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -456,6 +460,7 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None): # p @tf_export('debugging.assert_positive', v1=[]) +@dispatch.add_dispatch_support def assert_positive_v2(x, message=None, summarize=None, name=None): """Assert the condition `x > 0` holds element-wise. @@ -488,6 +493,7 @@ def assert_positive_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_positive', 'assert_positive']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_positive') @_unary_assert_doc('> 0', 'positive') def assert_positive(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -507,6 +513,7 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None): # p @tf_export('debugging.assert_non_negative', v1=[]) +@dispatch.add_dispatch_support def assert_non_negative_v2(x, message=None, summarize=None, name=None): """Assert the condition `x >= 0` holds element-wise. @@ -541,6 +548,7 @@ def assert_non_negative_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_non_negative', 'assert_non_negative']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_non_negative') @_unary_assert_doc('>= 0', 'non-negative') def assert_non_negative(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -561,6 +569,7 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_non_positive', v1=[]) +@dispatch.add_dispatch_support def assert_non_positive_v2(x, message=None, summarize=None, name=None): """Assert the condition `x <= 0` holds element-wise. @@ -595,6 +604,7 @@ def assert_non_positive_v2(x, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_non_positive', 'assert_non_positive']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_non_positive') @_unary_assert_doc('<= 0', 'non-positive') def assert_non_positive(x, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring @@ -615,6 +625,7 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_equal', 'assert_equal', v1=[]) +@dispatch.add_dispatch_support def assert_equal_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x == y` holds element-wise. 
@@ -649,6 +660,7 @@ def assert_equal_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_equal', 'assert_equal']) +@dispatch.add_dispatch_support @_binary_assert_doc('==') def assert_equal(x, y, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring with ops.name_scope(name, 'assert_equal', [x, y, data]): @@ -660,6 +672,7 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None): # p @tf_export('debugging.assert_none_equal', v1=[]) +@dispatch.add_dispatch_support def assert_none_equal_v2(x, y, summarize=None, message=None, name=None): """Assert the condition `x != y` holds for all elements. @@ -698,6 +711,7 @@ def assert_none_equal_v2(x, y, summarize=None, message=None, name=None): @tf_export(v1=['debugging.assert_none_equal', 'assert_none_equal']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_none_equal') @_binary_assert_doc('!=') def assert_none_equal( @@ -707,6 +721,7 @@ def assert_none_equal( @tf_export('debugging.assert_near', v1=[]) +@dispatch.add_dispatch_support def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None, name=None): """Assert the condition `x` and `y` are close element-wise. @@ -760,6 +775,7 @@ def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None, @tf_export(v1=['debugging.assert_near', 'assert_near']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_near') def assert_near( x, y, rtol=None, atol=None, data=None, summarize=None, message=None, @@ -839,6 +855,7 @@ def assert_near( @tf_export('debugging.assert_less', 'assert_less', v1=[]) +@dispatch.add_dispatch_support def assert_less_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x < y` holds element-wise. @@ -874,6 +891,7 @@ def assert_less_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_less', 'assert_less']) +@dispatch.add_dispatch_support @_binary_assert_doc('<') def assert_less(x, y, data=None, summarize=None, message=None, name=None): return _binary_assert('<', 'assert_less', math_ops.less, np.less, x, y, data, @@ -881,6 +899,7 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_less_equal', v1=[]) +@dispatch.add_dispatch_support def assert_less_equal_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x <= y` holds element-wise. @@ -917,6 +936,7 @@ def assert_less_equal_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_less_equal', 'assert_less_equal']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_less_equal') @_binary_assert_doc('<=') def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None): @@ -925,6 +945,7 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_greater', 'assert_greater', v1=[]) +@dispatch.add_dispatch_support def assert_greater_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x > y` holds element-wise. 
@@ -961,6 +982,7 @@ def assert_greater_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_greater', 'assert_greater']) +@dispatch.add_dispatch_support @_binary_assert_doc('>') def assert_greater(x, y, data=None, summarize=None, message=None, name=None): # pylint: disable=missing-docstring return _binary_assert('>', 'assert_greater', math_ops.greater, np.greater, x, @@ -968,6 +990,7 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None): # @tf_export('debugging.assert_greater_equal', v1=[]) +@dispatch.add_dispatch_support def assert_greater_equal_v2(x, y, message=None, summarize=None, name=None): """Assert the condition `x >= y` holds element-wise. @@ -1005,6 +1028,7 @@ def assert_greater_equal_v2(x, y, message=None, summarize=None, name=None): @tf_export(v1=['debugging.assert_greater_equal', 'assert_greater_equal']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_greater_equal') @_binary_assert_doc('>=') def assert_greater_equal(x, y, data=None, summarize=None, message=None, @@ -1062,6 +1086,7 @@ def _assert_rank_condition( @tf_export('debugging.assert_rank', 'assert_rank', v1=[]) +@dispatch.add_dispatch_support def assert_rank_v2(x, rank, message=None, name=None): """Assert that `x` has rank equal to `rank`. @@ -1095,6 +1120,7 @@ def assert_rank_v2(x, rank, message=None, name=None): @tf_export(v1=['debugging.assert_rank', 'assert_rank']) +@dispatch.add_dispatch_support def assert_rank(x, rank, data=None, summarize=None, message=None, name=None): """Assert `x` has rank equal to `rank`. @@ -1157,6 +1183,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None): @tf_export('debugging.assert_rank_at_least', v1=[]) +@dispatch.add_dispatch_support def assert_rank_at_least_v2(x, rank, message=None, name=None): """Assert that `x` has rank of at least `rank`. @@ -1190,6 +1217,7 @@ def assert_rank_at_least_v2(x, rank, message=None, name=None): @tf_export(v1=['debugging.assert_rank_at_least', 'assert_rank_at_least']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_rank_at_least') def assert_rank_at_least( x, rank, data=None, summarize=None, message=None, name=None): @@ -1322,6 +1350,7 @@ def _assert_ranks_condition( @tf_export('debugging.assert_rank_in', v1=[]) +@dispatch.add_dispatch_support def assert_rank_in_v2(x, ranks, message=None, name=None): """Assert that `x` has a rank in `ranks`. @@ -1354,6 +1383,7 @@ def assert_rank_in_v2(x, ranks, message=None, name=None): @tf_export(v1=['debugging.assert_rank_in', 'assert_rank_in']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_rank_in') def assert_rank_in( x, ranks, data=None, summarize=None, message=None, name=None): @@ -1417,6 +1447,7 @@ def assert_rank_in( @tf_export('debugging.assert_integer', v1=[]) +@dispatch.add_dispatch_support def assert_integer_v2(x, message=None, name=None): """Assert that `x` is of integer dtype. @@ -1437,6 +1468,7 @@ def assert_integer_v2(x, message=None, name=None): @tf_export(v1=['debugging.assert_integer', 'assert_integer']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_integer') def assert_integer(x, message=None, name=None): """Assert that `x` is of integer dtype. @@ -1476,6 +1508,7 @@ def assert_integer(x, message=None, name=None): @tf_export('debugging.assert_type', v1=[]) +@dispatch.add_dispatch_support def assert_type_v2(tensor, tf_type, message=None, name=None): """Asserts that the given `Tensor` is of the specified type. 
@@ -1495,6 +1528,7 @@ def assert_type_v2(tensor, tf_type, message=None, name=None): @tf_export(v1=['debugging.assert_type', 'assert_type']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_type') def assert_type(tensor, tf_type, message=None, name=None): """Statically asserts that the given `Tensor` is of the specified type. @@ -1584,6 +1618,7 @@ _TensorDimSizes = collections.namedtuple( @tf_export('debugging.assert_shapes', v1=[]) +@dispatch.add_dispatch_support def assert_shapes_v2(shapes, data=None, summarize=None, message=None, name=None): """Assert tensor shapes and dimension size relationships between tensors. @@ -1650,6 +1685,7 @@ def assert_shapes_v2(shapes, data=None, summarize=None, message=None, @tf_export(v1=['debugging.assert_shapes']) +@dispatch.add_dispatch_support def assert_shapes(shapes, data=None, summarize=None, message=None, name=None): """Assert tensor shapes and dimension size relationships between tensors. @@ -1939,6 +1975,7 @@ def is_numeric_tensor(tensor): 'math.is_non_decreasing', 'debugging.is_non_decreasing', 'is_non_decreasing' ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('debugging.is_non_decreasing', 'is_non_decreasing') def is_non_decreasing(x, name=None): @@ -1980,6 +2017,7 @@ def is_non_decreasing(x, name=None): 'math.is_strictly_increasing', 'debugging.is_strictly_increasing', 'is_strictly_increasing' ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('debugging.is_strictly_increasing', 'is_strictly_increasing') def is_strictly_increasing(x, name=None): @@ -2066,6 +2104,7 @@ def _assert_same_base_type(items, expected_type=None): @tf_export( 'debugging.assert_same_float_dtype', v1=['debugging.assert_same_float_dtype', 'assert_same_float_dtype']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_same_float_dtype') def assert_same_float_dtype(tensors=None, dtype=None): """Validate and return float type based on `tensors` and `dtype`. @@ -2098,6 +2137,7 @@ def assert_same_float_dtype(tensors=None, dtype=None): @tf_export('debugging.assert_scalar', v1=[]) +@dispatch.add_dispatch_support def assert_scalar_v2(tensor, message=None, name=None): """Asserts that the given `tensor` is a scalar. @@ -2120,6 +2160,7 @@ def assert_scalar_v2(tensor, message=None, name=None): @tf_export(v1=['debugging.assert_scalar', 'assert_scalar']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('assert_scalar') def assert_scalar(tensor, name=None, message=None): """Asserts that the given `tensor` is a scalar (i.e. zero-dimensional). @@ -2154,6 +2195,7 @@ def assert_scalar(tensor, name=None, message=None): @tf_export('ensure_shape') +@dispatch.add_dispatch_support def ensure_shape(x, shape, name=None): """Updates the shape of a tensor and checks at runtime that the shape holds. diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py index edb35afa52c..f7662516b4f 100644 --- a/tensorflow/python/ops/clip_ops.py +++ b/tensorflow/python/ops/clip_ops.py @@ -152,6 +152,7 @@ def _clip_by_value_grad(op, grad): @tf_export("clip_by_norm") +@dispatch.add_dispatch_support def clip_by_norm(t, clip_norm, axes=None, name=None): """Clips tensor values to a maximum L2-norm. 
@@ -235,6 +236,7 @@ def clip_by_norm(t, clip_norm, axes=None, name=None): @tf_export("linalg.global_norm", v1=["linalg.global_norm", "global_norm"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("global_norm") def global_norm(t_list, name=None): """Computes the global norm of multiple tensors. @@ -285,6 +287,7 @@ def global_norm(t_list, name=None): @tf_export("clip_by_global_norm") +@dispatch.add_dispatch_support def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None): """Clips values of multiple tensors by the ratio of the sum of their norms. @@ -382,6 +385,7 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None): "use clip_by_norm(t, clip_norm * tf.cast(tf.size(t), tf.float32), name) " "instead.") @tf_export(v1=["clip_by_average_norm"]) +@dispatch.add_dispatch_support def clip_by_average_norm(t, clip_norm, name=None): """Clips tensor values to a maximum average L2-norm. diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py index 3e885975b03..39177defe57 100644 --- a/tensorflow/python/ops/confusion_matrix.py +++ b/tensorflow/python/ops/confusion_matrix.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -93,6 +94,7 @@ def remove_squeezable_dimensions( @tf_export('math.confusion_matrix', v1=[]) +@dispatch.add_dispatch_support def confusion_matrix(labels, predictions, num_classes=None, @@ -202,6 +204,7 @@ def confusion_matrix(labels, @tf_export(v1=['math.confusion_matrix', 'confusion_matrix']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('confusion_matrix', 'train.confusion_matrix') def confusion_matrix_v1(labels, predictions, diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index 58948f7d52a..918c989432d 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -54,6 +54,7 @@ from tensorflow.python.ops.gen_control_flow_ops import * from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util import tf_should_use from tensorflow.python.util.lazy_loader import LazyLoader @@ -110,6 +111,7 @@ def _summarize_eager(tensor, summarize=None): # Assert and Print are special symbols in python, so we must # use an upper-case version of them. @tf_export("debugging.Assert", "Assert") +@dispatch.add_dispatch_support @tf_should_use.should_use_result def Assert(condition, data, summarize=None, name=None): """Asserts that the given condition is true. @@ -1095,6 +1097,7 @@ def _UnpackIfSingleton(res): # pylint: disable=redefined-outer-name # pylint: disable=g-doc-args @tf_export(v1=["cond"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args( None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.", "fn1", "fn2") @@ -1318,6 +1321,7 @@ def _cast_indexed_slice_indices(a, b): @tf_export("cond", v1=[]) +@dispatch.add_dispatch_support def cond_for_tf_v2(pred, true_fn=None, false_fn=None, name=None): """Return `true_fn()` if the predicate `pred` is true else `false_fn()`. 
@@ -2942,6 +2946,7 @@ def group(*inputs, **kwargs): @tf_export("tuple", v1=[]) +@dispatch.add_dispatch_support def tuple_v2(tensors, control_inputs=None, name=None): """Group tensors together. @@ -2978,6 +2983,7 @@ def tuple_v2(tensors, control_inputs=None, name=None): @tf_export(v1=["tuple"]) +@dispatch.add_dispatch_support def tuple(tensors, name=None, control_inputs=None): # pylint: disable=redefined-builtin """Group tensors together. @@ -3312,6 +3318,7 @@ def _indexed_case_helper(branch_fns, default, branch_index, name): @tf_export("case", v1=[]) +@dispatch.add_dispatch_support def case_v2(pred_fn_pairs, default=None, exclusive=False, @@ -3416,6 +3423,7 @@ def case_v2(pred_fn_pairs, @tf_export(v1=["case"]) +@dispatch.add_dispatch_support def case(pred_fn_pairs, default=None, exclusive=False, diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py index d989bc0be44..6c9cdf1dd08 100644 --- a/tensorflow/python/ops/ctc_ops.py +++ b/tensorflow/python/ops/ctc_ops.py @@ -43,6 +43,7 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops.nn_grad import _BroadcastMul from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -70,6 +71,7 @@ def _generate_defun_backend(unique_api_name, preferred_device, func): # pylint: disable=protected-access, invalid-name @tf_export(v1=["nn.ctc_loss"]) +@dispatch.add_dispatch_support def ctc_loss(labels, inputs=None, sequence_length=None, @@ -284,6 +286,7 @@ def _CTCLossV2Grad(op, grad_loss, _): @tf_export("nn.ctc_greedy_decoder") +@dispatch.add_dispatch_support def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True): """Performs greedy decoding on the logits given in input (best path). @@ -333,6 +336,7 @@ def ctc_greedy_decoder(inputs, sequence_length, merge_repeated=True): @tf_export(v1=["nn.ctc_beam_search_decoder"]) +@dispatch.add_dispatch_support def ctc_beam_search_decoder(inputs, sequence_length, beam_width=100, @@ -395,6 +399,7 @@ def ctc_beam_search_decoder(inputs, @tf_export("nn.ctc_beam_search_decoder", v1=["nn.ctc_beam_search_decoder_v2"]) +@dispatch.add_dispatch_support def ctc_beam_search_decoder_v2(inputs, sequence_length, beam_width=100, @@ -731,6 +736,7 @@ def _ctc_loss_shape(op): # pylint: disable=protected-access, invalid-name @tf_export(v1=["nn.ctc_loss_v2"]) +@dispatch.add_dispatch_support def ctc_loss_v2(labels, logits, label_length, @@ -825,6 +831,7 @@ def ctc_loss_v2(labels, @tf_export("nn.ctc_loss", v1=[]) +@dispatch.add_dispatch_support def ctc_loss_v3(labels, logits, label_length, @@ -1056,6 +1063,7 @@ def ctc_loss_dense(labels, @tf_export("nn.collapse_repeated") +@dispatch.add_dispatch_support def collapse_repeated(labels, seq_length, name=None): """Merge repeated labels into single labels. @@ -1153,6 +1161,7 @@ def dense_labels_to_sparse(dense, length): @tf_export("nn.ctc_unique_labels") +@dispatch.add_dispatch_support def ctc_unique_labels(labels, name=None): """Get unique labels and indices for batched labels for `tf.nn.ctc_loss`. 
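For reference while reviewing these hunks, the decorated wrapper behaves roughly as sketched below (paraphrased from tensorflow/python/util/dispatch.py, with the dispatcher-registration bookkeeping omitted): the op runs as before, and only when it raises TypeError or ValueError do registered dispatchers get a chance to produce a result before the original error is re-raised.

import functools

from tensorflow.python.util import dispatch


def add_dispatch_support_sketch(target):
  """Approximation of the decorator applied throughout this patch."""

  @functools.wraps(target)
  def op_dispatch_handler(*args, **kwargs):
    try:
      return target(*args, **kwargs)  # fast path: argument types are supported
    except (TypeError, ValueError):
      result = dispatch.dispatch(op_dispatch_handler, *args, **kwargs)
      if result is not dispatch.OpDispatcher.NOT_SUPPORTED:
        return result  # a registered dispatcher produced a value
      raise  # no dispatcher handled it; surface the original error

  return op_dispatch_handler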
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py index 2fdae49b1f6..1c7b204fa58 100644 --- a/tensorflow/python/ops/embedding_ops.py +++ b/tensorflow/python/ops/embedding_ops.py @@ -36,6 +36,7 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import variables from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -250,6 +251,7 @@ def _embedding_lookup_and_transform(params, @tf_export(v1=["nn.embedding_lookup"]) +@dispatch.add_dispatch_support def embedding_lookup( params, ids, @@ -327,6 +329,7 @@ def embedding_lookup( @tf_export("nn.embedding_lookup", v1=[]) +@dispatch.add_dispatch_support def embedding_lookup_v2(params, ids, max_norm=None, name=None): """Looks up embeddings for the given `ids` from a list of tensors. @@ -392,6 +395,7 @@ def embedding_lookup_v2(params, ids, max_norm=None, name=None): @tf_export(v1=["nn.embedding_lookup_sparse"]) +@dispatch.add_dispatch_support def embedding_lookup_sparse(params, sp_ids, sp_weights, @@ -574,6 +578,7 @@ def embedding_lookup_sparse(params, @tf_export("nn.embedding_lookup_sparse", v1=[]) +@dispatch.add_dispatch_support def embedding_lookup_sparse_v2(params, sp_ids, sp_weights, @@ -664,6 +669,7 @@ def embedding_lookup_sparse_v2(params, @tf_export("nn.safe_embedding_lookup_sparse", v1=[]) +@dispatch.add_dispatch_support def safe_embedding_lookup_sparse_v2(embedding_weights, sparse_ids, sparse_weights=None, @@ -765,6 +771,7 @@ def safe_embedding_lookup_sparse_v2(embedding_weights, @tf_export(v1=["nn.safe_embedding_lookup_sparse"]) +@dispatch.add_dispatch_support def safe_embedding_lookup_sparse(embedding_weights, sparse_ids, sparse_weights=None, diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py index 8ec925824de..37b41a55eb9 100644 --- a/tensorflow/python/ops/functional_ops.py +++ b/tensorflow/python/ops/functional_ops.py @@ -38,6 +38,7 @@ from tensorflow.python.ops.gen_functional_ops import remote_call from tensorflow.python.ops.gen_functional_ops import symbolic_gradient from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import function_utils from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -45,6 +46,7 @@ from tensorflow.python.util.tf_export import tf_export # TODO(yuanbyu, mrry): Handle stride to support sliding windows. @tf_export(v1=["foldl"]) +@dispatch.add_dispatch_support def foldl(fn, elems, initializer=None, @@ -162,6 +164,7 @@ def foldl(fn, @tf_export("foldl", v1=[]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, """back_prop=False is deprecated. Consider using tf.stop_gradient instead. @@ -238,6 +241,7 @@ def foldl_v2(fn, @tf_export(v1=["foldr"]) +@dispatch.add_dispatch_support def foldr(fn, elems, initializer=None, @@ -356,6 +360,7 @@ def foldr(fn, @tf_export("foldr", v1=[]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, """back_prop=False is deprecated. Consider using tf.stop_gradient instead. 
@@ -432,6 +437,7 @@ def foldr_v2(fn, @tf_export(v1=["scan"]) +@dispatch.add_dispatch_support def scan(fn, elems, initializer=None, @@ -686,6 +692,7 @@ def scan(fn, @tf_export("scan", v1=[]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, """back_prop=False is deprecated. Consider using tf.stop_gradient instead. diff --git a/tensorflow/python/ops/histogram_ops.py b/tensorflow/python/ops/histogram_ops.py index 92f3e7a24ba..d88025d653c 100644 --- a/tensorflow/python/ops/histogram_ops.py +++ b/tensorflow/python/ops/histogram_ops.py @@ -26,10 +26,12 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import clip_ops from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('histogram_fixed_width_bins') +@dispatch.add_dispatch_support def histogram_fixed_width_bins(values, value_range, nbins=100, @@ -101,6 +103,7 @@ def histogram_fixed_width_bins(values, @tf_export('histogram_fixed_width') +@dispatch.add_dispatch_support def histogram_fixed_width(values, value_range, nbins=100, diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index 52b65efad67..4920be213d8 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import sort_ops from tensorflow.python.ops import string_ops from tensorflow.python.ops import variables from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export ops.NotDifferentiable('RandomCrop') @@ -323,6 +324,7 @@ def fix_image_flip_shape(image, result): @tf_export('image.random_flip_up_down') +@dispatch.add_dispatch_support def random_flip_up_down(image, seed=None): """Randomly flips an image vertically (upside down). @@ -363,6 +365,7 @@ def random_flip_up_down(image, seed=None): @tf_export('image.random_flip_left_right') +@dispatch.add_dispatch_support def random_flip_left_right(image, seed=None): """Randomly flip an image horizontally (left to right). @@ -450,6 +453,7 @@ def _random_flip(image, flip_index, seed, scope_name): @tf_export('image.flip_left_right') +@dispatch.add_dispatch_support def flip_left_right(image): """Flip an image horizontally (left to right). @@ -484,6 +488,7 @@ def flip_left_right(image): @tf_export('image.flip_up_down') +@dispatch.add_dispatch_support def flip_up_down(image): """Flip an image vertically (upside down). @@ -549,6 +554,7 @@ def _flip(image, flip_index, scope_name): @tf_export('image.rot90') +@dispatch.add_dispatch_support def rot90(image, k=1, name=None): """Rotate image(s) counter-clockwise by 90 degrees. @@ -660,6 +666,7 @@ def _rot90_4D(images, k, name_scope): @tf_export('image.transpose', v1=['image.transpose', 'image.transpose_image']) +@dispatch.add_dispatch_support def transpose(image, name=None): """Transpose image(s) by swapping the height and width dimension. @@ -718,6 +725,7 @@ def transpose(image, name=None): @tf_export('image.central_crop') +@dispatch.add_dispatch_support def central_crop(image, central_fraction): """Crop the central region of the image(s). 
@@ -850,6 +858,7 @@ def central_crop(image, central_fraction): @tf_export('image.pad_to_bounding_box') +@dispatch.add_dispatch_support def pad_to_bounding_box(image, offset_height, offset_width, target_height, target_width): """Pad `image` with zeros to the specified `height` and `width`. @@ -959,6 +968,7 @@ def pad_to_bounding_box(image, offset_height, offset_width, target_height, @tf_export('image.crop_to_bounding_box') +@dispatch.add_dispatch_support def crop_to_bounding_box(image, offset_height, offset_width, target_height, target_width): """Crops an image to a specified bounding box. @@ -1041,6 +1051,7 @@ def crop_to_bounding_box(image, offset_height, offset_width, target_height, @tf_export( 'image.resize_with_crop_or_pad', v1=['image.resize_with_crop_or_pad', 'image.resize_image_with_crop_or_pad']) +@dispatch.add_dispatch_support def resize_image_with_crop_or_pad(image, target_height, target_width): """Crops and/or pads an image to a target width and height. @@ -1258,6 +1269,7 @@ def _resize_images_common(images, resizer_fn, size, preserve_aspect_ratio, name, @tf_export(v1=['image.resize_images', 'image.resize']) +@dispatch.add_dispatch_support def resize_images(images, size, method=ResizeMethodV1.BILINEAR, @@ -1343,6 +1355,7 @@ def resize_images(images, @tf_export('image.resize', v1=[]) +@dispatch.add_dispatch_support def resize_images_v2(images, size, method=ResizeMethod.BILINEAR, @@ -1594,6 +1607,7 @@ def _resize_image_with_pad_common(image, target_height, target_width, @tf_export(v1=['image.resize_image_with_pad']) +@dispatch.add_dispatch_support def resize_image_with_pad_v1(image, target_height, target_width, @@ -1636,6 +1650,7 @@ def resize_image_with_pad_v1(image, @tf_export('image.resize_with_pad', v1=[]) +@dispatch.add_dispatch_support def resize_image_with_pad_v2(image, target_height, target_width, @@ -1676,6 +1691,7 @@ def resize_image_with_pad_v2(image, @tf_export('image.per_image_standardization') +@dispatch.add_dispatch_support def per_image_standardization(image): """Linearly scales each image in `image` to have mean 0 and variance 1. @@ -1721,6 +1737,7 @@ def per_image_standardization(image): @tf_export('image.random_brightness') +@dispatch.add_dispatch_support def random_brightness(image, max_delta, seed=None): """Adjust the brightness of images by a random factor. @@ -1756,6 +1773,7 @@ def random_brightness(image, max_delta, seed=None): @tf_export('image.random_contrast') +@dispatch.add_dispatch_support def random_contrast(image, lower, upper, seed=None): """Adjust the contrast of an image or images by a random factor. @@ -1796,6 +1814,7 @@ def random_contrast(image, lower, upper, seed=None): @tf_export('image.adjust_brightness') +@dispatch.add_dispatch_support def adjust_brightness(image, delta): """Adjust the brightness of RGB or Grayscale images. @@ -1847,6 +1866,7 @@ def adjust_brightness(image, delta): @tf_export('image.adjust_contrast') +@dispatch.add_dispatch_support def adjust_contrast(images, contrast_factor): """Adjust contrast of RGB or grayscale images. @@ -1903,6 +1923,7 @@ def adjust_contrast(images, contrast_factor): @tf_export('image.adjust_gamma') +@dispatch.add_dispatch_support def adjust_gamma(image, gamma=1, gain=1): """Performs [Gamma Correction](http://en.wikipedia.org/wiki/Gamma_correction). 
@@ -1967,6 +1988,7 @@ def adjust_gamma(image, gamma=1, gain=1): @tf_export('image.convert_image_dtype') +@dispatch.add_dispatch_support def convert_image_dtype(image, dtype, saturate=False, name=None): """Convert `image` to `dtype`, scaling its values if needed. @@ -2066,6 +2088,7 @@ def convert_image_dtype(image, dtype, saturate=False, name=None): @tf_export('image.rgb_to_grayscale') +@dispatch.add_dispatch_support def rgb_to_grayscale(images, name=None): """Converts one or more images from RGB to Grayscale. @@ -2101,6 +2124,7 @@ def rgb_to_grayscale(images, name=None): @tf_export('image.grayscale_to_rgb') +@dispatch.add_dispatch_support def grayscale_to_rgb(images, name=None): """Converts one or more images from Grayscale to RGB. @@ -2137,6 +2161,7 @@ def grayscale_to_rgb(images, name=None): # pylint: disable=invalid-name @tf_export('image.random_hue') +@dispatch.add_dispatch_support def random_hue(image, max_delta, seed=None): """Adjust the hue of RGB images by a random factor. @@ -2179,6 +2204,7 @@ def random_hue(image, max_delta, seed=None): @tf_export('image.adjust_hue') +@dispatch.add_dispatch_support def adjust_hue(image, delta, name=None): """Adjust hue of RGB images. @@ -2246,6 +2272,7 @@ def adjust_hue(image, delta, name=None): # pylint: disable=invalid-name @tf_export('image.random_jpeg_quality') +@dispatch.add_dispatch_support def random_jpeg_quality(image, min_jpeg_quality, max_jpeg_quality, seed=None): """Randomly changes jpeg encoding quality for inducing jpeg noise. @@ -2293,6 +2320,7 @@ def random_jpeg_quality(image, min_jpeg_quality, max_jpeg_quality, seed=None): @tf_export('image.adjust_jpeg_quality') +@dispatch.add_dispatch_support def adjust_jpeg_quality(image, jpeg_quality, name=None): """Adjust jpeg encoding quality of an image. @@ -2343,6 +2371,7 @@ def adjust_jpeg_quality(image, jpeg_quality, name=None): @tf_export('image.random_saturation') +@dispatch.add_dispatch_support def random_saturation(image, lower, upper, seed=None): """Adjust the saturation of RGB images by a random factor. @@ -2389,6 +2418,7 @@ def random_saturation(image, lower, upper, seed=None): @tf_export('image.adjust_saturation') +@dispatch.add_dispatch_support def adjust_saturation(image, saturation_factor, name=None): """Adjust saturation of RGB images. 
@@ -2480,42 +2510,43 @@ tf_export( 'io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg', v1=['io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg'])( - gen_image_ops.decode_and_crop_jpeg) + dispatch.add_dispatch_support(gen_image_ops.decode_and_crop_jpeg)) tf_export( 'io.decode_bmp', 'image.decode_bmp', v1=['io.decode_bmp', 'image.decode_bmp'])( - gen_image_ops.decode_bmp) + dispatch.add_dispatch_support(gen_image_ops.decode_bmp)) tf_export( 'io.decode_gif', 'image.decode_gif', v1=['io.decode_gif', 'image.decode_gif'])( - gen_image_ops.decode_gif) + dispatch.add_dispatch_support(gen_image_ops.decode_gif)) tf_export( 'io.decode_jpeg', 'image.decode_jpeg', v1=['io.decode_jpeg', 'image.decode_jpeg'])( - gen_image_ops.decode_jpeg) + dispatch.add_dispatch_support(gen_image_ops.decode_jpeg)) tf_export( 'io.decode_png', 'image.decode_png', v1=['io.decode_png', 'image.decode_png'])( - gen_image_ops.decode_png) + dispatch.add_dispatch_support(gen_image_ops.decode_png)) tf_export( 'io.encode_jpeg', 'image.encode_jpeg', v1=['io.encode_jpeg', 'image.encode_jpeg'])( - gen_image_ops.encode_jpeg) + dispatch.add_dispatch_support(gen_image_ops.encode_jpeg)) tf_export( 'io.extract_jpeg_shape', 'image.extract_jpeg_shape', v1=['io.extract_jpeg_shape', 'image.extract_jpeg_shape'])( - gen_image_ops.extract_jpeg_shape) + dispatch.add_dispatch_support(gen_image_ops.extract_jpeg_shape)) @tf_export('io.encode_png', 'image.encode_png') +@dispatch.add_dispatch_support def encode_png(image, compression=-1, name=None): r"""PNG-encode an image. @@ -2548,6 +2579,7 @@ def encode_png(image, compression=-1, name=None): 'io.decode_image', 'image.decode_image', v1=['io.decode_image', 'image.decode_image']) +@dispatch.add_dispatch_support def decode_image(contents, channels=None, dtype=dtypes.uint8, @@ -2661,6 +2693,7 @@ def decode_image(contents, @tf_export('image.total_variation') +@dispatch.add_dispatch_support def total_variation(images, name=None): """Calculate and return the total variation for one or more images. @@ -2732,6 +2765,7 @@ def total_variation(images, name=None): @tf_export('image.sample_distorted_bounding_box', v1=[]) +@dispatch.add_dispatch_support def sample_distorted_bounding_box_v2(image_size, bounding_boxes, seed=0, @@ -2831,6 +2865,7 @@ def sample_distorted_bounding_box_v2(image_size, @tf_export(v1=['image.sample_distorted_bounding_box']) +@dispatch.add_dispatch_support @deprecation.deprecated( date=None, instructions='`seed2` arg is deprecated.' @@ -2945,6 +2980,7 @@ def sample_distorted_bounding_box(image_size, @tf_export('image.non_max_suppression') +@dispatch.add_dispatch_support def non_max_suppression(boxes, scores, max_output_size, @@ -2997,6 +3033,7 @@ def non_max_suppression(boxes, @tf_export('image.non_max_suppression_with_scores') +@dispatch.add_dispatch_support def non_max_suppression_with_scores(boxes, scores, max_output_size, @@ -3083,6 +3120,7 @@ def non_max_suppression_with_scores(boxes, @tf_export('image.non_max_suppression_overlaps') +@dispatch.add_dispatch_support def non_max_suppression_with_overlaps(overlaps, scores, max_output_size, @@ -3134,6 +3172,7 @@ _rgb_to_yiq_kernel = [[0.299, 0.59590059, 0.2115], @tf_export('image.rgb_to_yiq') +@dispatch.add_dispatch_support def rgb_to_yiq(images): """Converts one or more images from RGB to YIQ. @@ -3167,6 +3206,7 @@ _yiq_to_rgb_kernel = [[1, 1, 1], [0.95598634, -0.27201283, -1.10674021], @tf_export('image.yiq_to_rgb') +@dispatch.add_dispatch_support def yiq_to_rgb(images): """Converts one or more images from YIQ to RGB. 
@@ -3195,6 +3235,7 @@ _rgb_to_yuv_kernel = [[0.299, -0.14714119, 0.61497538], @tf_export('image.rgb_to_yuv') +@dispatch.add_dispatch_support def rgb_to_yuv(images): """Converts one or more images from RGB to YUV. @@ -3221,6 +3262,7 @@ _yuv_to_rgb_kernel = [[1, 1, 1], [0, -0.394642334, 2.03206185], @tf_export('image.yuv_to_rgb') +@dispatch.add_dispatch_support def yuv_to_rgb(images): """Converts one or more images from YUV to RGB. @@ -3314,6 +3356,7 @@ def _verify_compatible_image_shapes(img1, img2): @tf_export('image.psnr') +@dispatch.add_dispatch_support def psnr(a, b, max_val, name=None): """Returns the Peak Signal-to-Noise Ratio between a and b. @@ -3525,6 +3568,7 @@ def _ssim_per_channel(img1, @tf_export('image.ssim') +@dispatch.add_dispatch_support def ssim(img1, img2, max_val, @@ -3604,6 +3648,7 @@ _MSSSIM_WEIGHTS = (0.0448, 0.2856, 0.3001, 0.2363, 0.1333) @tf_export('image.ssim_multiscale') +@dispatch.add_dispatch_support def ssim_multiscale(img1, img2, max_val, @@ -3731,6 +3776,7 @@ def ssim_multiscale(img1, @tf_export('image.image_gradients') +@dispatch.add_dispatch_support def image_gradients(image): """Returns image gradients (dy, dx) for each color channel. @@ -3804,6 +3850,7 @@ def image_gradients(image): @tf_export('image.sobel_edges') +@dispatch.add_dispatch_support def sobel_edges(image): """Returns a tensor holding Sobel edge maps. @@ -3888,21 +3935,22 @@ resize_area_deprecation = deprecation.deprecated( instructions=( 'Use `tf.image.resize(...method=ResizeMethod.AREA...)` instead.')) tf_export(v1=['image.resize_area'])( - resize_area_deprecation(gen_image_ops.resize_area)) + resize_area_deprecation( + dispatch.add_dispatch_support(gen_image_ops.resize_area))) resize_bicubic_deprecation = deprecation.deprecated( date=None, instructions=( 'Use `tf.image.resize(...method=ResizeMethod.BICUBIC...)` instead.')) tf_export(v1=['image.resize_bicubic'])( - resize_bicubic_deprecation(resize_bicubic)) + dispatch.add_dispatch_support(resize_bicubic_deprecation(resize_bicubic))) resize_bilinear_deprecation = deprecation.deprecated( date=None, instructions=( 'Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.')) tf_export(v1=['image.resize_bilinear'])( - resize_bilinear_deprecation(resize_bilinear)) + dispatch.add_dispatch_support(resize_bilinear_deprecation(resize_bilinear))) resize_nearest_neighbor_deprecation = deprecation.deprecated( date=None, @@ -3910,10 +3958,12 @@ resize_nearest_neighbor_deprecation = deprecation.deprecated( 'Use `tf.image.resize(...method=ResizeMethod.NEAREST_NEIGHBOR...)` ' 'instead.')) tf_export(v1=['image.resize_nearest_neighbor'])( - resize_nearest_neighbor_deprecation(resize_nearest_neighbor)) + dispatch.add_dispatch_support( + resize_nearest_neighbor_deprecation(resize_nearest_neighbor))) @tf_export('image.crop_and_resize', v1=[]) +@dispatch.add_dispatch_support def crop_and_resize_v2(image, boxes, box_indices, @@ -3997,6 +4047,7 @@ def crop_and_resize_v2(image, @tf_export(v1=['image.crop_and_resize']) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, 'box_ind is deprecated, use box_indices instead', 'box_ind') @@ -4019,6 +4070,7 @@ crop_and_resize_v1.__doc__ = gen_image_ops.crop_and_resize.__doc__ @tf_export(v1=['image.extract_glimpse']) +@dispatch.add_dispatch_support def extract_glimpse( input, # pylint: disable=redefined-builtin size, @@ -4104,6 +4156,7 @@ def extract_glimpse( @tf_export('image.extract_glimpse', v1=[]) +@dispatch.add_dispatch_support def extract_glimpse_v2( input, # pylint: disable=redefined-builtin 
size, @@ -4190,6 +4243,7 @@ def extract_glimpse_v2( @tf_export('image.combined_non_max_suppression') +@dispatch.add_dispatch_support def combined_non_max_suppression(boxes, scores, max_output_size_per_class, @@ -4442,6 +4496,7 @@ def _suppression_loop_body(boxes, iou_threshold, output_size, idx, tile_size): @tf_export('image.non_max_suppression_padded') +@dispatch.add_dispatch_support def non_max_suppression_padded(boxes, scores, max_output_size, @@ -4816,6 +4871,7 @@ def non_max_suppression_padded_v1(boxes, @tf_export('image.draw_bounding_boxes', v1=[]) +@dispatch.add_dispatch_support def draw_bounding_boxes_v2(images, boxes, colors, name=None): """Draw bounding boxes on a batch of images. @@ -4870,6 +4926,7 @@ def draw_bounding_boxes_v2(images, boxes, colors, name=None): @tf_export(v1=['image.draw_bounding_boxes']) +@dispatch.add_dispatch_support def draw_bounding_boxes(images, boxes, name=None, colors=None): """Draw bounding boxes on a batch of images. @@ -4922,6 +4979,7 @@ def draw_bounding_boxes(images, boxes, name=None, colors=None): @tf_export('image.generate_bounding_box_proposals') +@dispatch.add_dispatch_support def generate_bounding_box_proposals(scores, bbox_deltas, image_info, diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py index f7617d83caf..82acd09caec 100644 --- a/tensorflow/python/ops/linalg/linalg_impl.py +++ b/tensorflow/python/ops/linalg/linalg_impl.py @@ -41,7 +41,7 @@ cholesky = linalg_ops.cholesky cholesky_solve = linalg_ops.cholesky_solve det = linalg_ops.matrix_determinant slogdet = gen_linalg_ops.log_matrix_determinant -tf_export('linalg.slogdet')(slogdet) +tf_export('linalg.slogdet')(dispatch.add_dispatch_support(slogdet)) diag = array_ops.matrix_diag diag_part = array_ops.matrix_diag_part eigh = linalg_ops.self_adjoint_eig @@ -51,7 +51,7 @@ eye = linalg_ops.eye inv = linalg_ops.matrix_inverse logm = gen_linalg_ops.matrix_logarithm lu = gen_linalg_ops.lu -tf_export('linalg.logm')(logm) +tf_export('linalg.logm')(dispatch.add_dispatch_support(logm)) lstsq = linalg_ops.matrix_solve_ls norm = linalg_ops.norm qr = linalg_ops.qr @@ -230,6 +230,7 @@ def _matrix_exp_pade13(matrix): @tf_export('linalg.expm') +@dispatch.add_dispatch_support def matrix_exponential(input, name=None): # pylint: disable=redefined-builtin r"""Computes the matrix exponential of one or more square matrices. @@ -340,6 +341,7 @@ def matrix_exponential(input, name=None): # pylint: disable=redefined-builtin @tf_export('linalg.tridiagonal_solve') +@dispatch.add_dispatch_support def tridiagonal_solve(diagonals, rhs, diagonals_format='compact', @@ -541,6 +543,7 @@ def _tridiagonal_solve_compact_format(diagonals, rhs, transpose_rhs, @tf_export('linalg.tridiagonal_matmul') +@dispatch.add_dispatch_support def tridiagonal_matmul(diagonals, rhs, diagonals_format='compact', name=None): r"""Multiplies tridiagonal matrix by matrix. @@ -638,6 +641,7 @@ def _maybe_validate_matrix(a, validate_args): @tf_export('linalg.matrix_rank') +@dispatch.add_dispatch_support def matrix_rank(a, tol=None, validate_args=False, name=None): """Compute the matrix rank of one or more matrices. @@ -676,6 +680,7 @@ def matrix_rank(a, tol=None, validate_args=False, name=None): @tf_export('linalg.pinv') +@dispatch.add_dispatch_support def pinv(a, rcond=None, validate_args=False, name=None): """Compute the Moore-Penrose pseudo-inverse of one or more matrices. 
@@ -805,6 +810,7 @@ def pinv(a, rcond=None, validate_args=False, name=None): @tf_export('linalg.lu_solve') +@dispatch.add_dispatch_support def lu_solve(lower_upper, perm, rhs, validate_args=False, name=None): """Solves systems of linear eqns `A X = RHS`, given LU factorizations. @@ -902,6 +908,7 @@ def lu_solve(lower_upper, perm, rhs, validate_args=False, name=None): @tf_export('linalg.lu_matrix_inverse') +@dispatch.add_dispatch_support def lu_matrix_inverse(lower_upper, perm, validate_args=False, name=None): """Computes the inverse given the LU decomposition(s) of one or more matrices. @@ -966,6 +973,7 @@ def lu_matrix_inverse(lower_upper, perm, validate_args=False, name=None): @tf_export('linalg.lu_reconstruct') +@dispatch.add_dispatch_support def lu_reconstruct(lower_upper, perm, validate_args=False, name=None): """The reconstruct one or more matrices from their LU decomposition(s). diff --git a/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py b/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py index 613309f856d..6794636c3fd 100644 --- a/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py +++ b/tensorflow/python/ops/linalg/sparse/conjugate_gradient.py @@ -27,10 +27,12 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.linalg import linalg_impl as linalg +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('linalg.experimental.conjugate_gradient') +@dispatch.add_dispatch_support def conjugate_gradient(operator, rhs, preconditioner=None, diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py index abca7df19e0..03b7b98119d 100644 --- a/tensorflow/python/ops/linalg_ops.py +++ b/tensorflow/python/ops/linalg_ops.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops.gen_linalg_ops import * # pylint: enable=wildcard-import from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # Names below are lower_case. @@ -82,6 +83,7 @@ def _RegularizedGramianCholesky(matrix, l2_regularizer, first_kind): @tf_export( 'linalg.triangular_solve', v1=['linalg.triangular_solve', 'matrix_triangular_solve']) +@dispatch.add_dispatch_support def matrix_triangular_solve(matrix, rhs, lower=True, adjoint=False, name=None): """Solve systems of linear equations with upper or lower triangular matrices. @@ -143,6 +145,7 @@ def matrix_triangular_solve(matrix, rhs, lower=True, adjoint=False, name=None): @tf_export( 'linalg.cholesky_solve', v1=['linalg.cholesky_solve', 'cholesky_solve']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('cholesky_solve') def cholesky_solve(chol, rhs, name=None): """Solves systems of linear eqns `A X = RHS`, given Cholesky factorizations. @@ -187,6 +190,7 @@ def cholesky_solve(chol, rhs, name=None): @tf_export('eye', 'linalg.eye') +@dispatch.add_dispatch_support def eye(num_rows, num_columns=None, batch_shape=None, @@ -234,6 +238,7 @@ def eye(num_rows, @tf_export('linalg.lstsq', v1=['linalg.lstsq', 'matrix_solve_ls']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('matrix_solve_ls') def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None): r"""Solves one or more linear least-squares problems. 
@@ -371,6 +376,7 @@ def matrix_solve_ls(matrix, rhs, l2_regularizer=0.0, fast=True, name=None): @tf_export('linalg.eig', 'eig', v1=[]) +@dispatch.add_dispatch_support def eig(tensor, name=None): """Computes the eigen decomposition of a batch of matrices. @@ -401,6 +407,7 @@ def eig(tensor, name=None): @tf_export('linalg.eigvals', 'eigvals', v1=[]) +@dispatch.add_dispatch_support def eigvals(tensor, name=None): """Computes the eigenvalues of one or more matrices. @@ -427,6 +434,7 @@ def eigvals(tensor, name=None): @tf_export('linalg.eigh', v1=['linalg.eigh', 'self_adjoint_eig']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('self_adjoint_eig') def self_adjoint_eig(tensor, name=None): """Computes the eigen decomposition of a batch of self-adjoint matrices. @@ -450,6 +458,7 @@ def self_adjoint_eig(tensor, name=None): @tf_export('linalg.eigvalsh', v1=['linalg.eigvalsh', 'self_adjoint_eigvals']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('self_adjoint_eigvals') def self_adjoint_eigvals(tensor, name=None): """Computes the eigenvalues of one or more self-adjoint matrices. @@ -473,6 +482,7 @@ def self_adjoint_eigvals(tensor, name=None): @tf_export('linalg.svd', v1=['linalg.svd', 'svd']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('svd') def svd(tensor, full_matrices=False, compute_uv=True, name=None): r"""Computes the singular value decompositions of one or more matrices. @@ -544,6 +554,7 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None): # pylint: disable=redefined-builtin @tf_export('norm', 'linalg.norm', v1=[]) +@dispatch.add_dispatch_support def norm_v2(tensor, ord='euclidean', axis=None, @@ -615,6 +626,7 @@ def norm_v2(tensor, # pylint: disable=redefined-builtin @tf_export(v1=['norm', 'linalg.norm']) +@dispatch.add_dispatch_support @deprecation.deprecated_args( None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims') def norm(tensor, diff --git a/tensorflow/python/ops/logging_ops.py b/tensorflow/python/ops/logging_ops.py index 7e980a0dbb3..8ca63f55987 100644 --- a/tensorflow/python/ops/logging_ops.py +++ b/tensorflow/python/ops/logging_ops.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import string_ops from tensorflow.python.ops.gen_logging_ops import * # pylint: enable=wildcard-import from tensorflow.python.platform import tf_logging +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util.deprecation import deprecated from tensorflow.python.util.tf_export import tf_export @@ -71,6 +72,7 @@ except NameError: "only a concern in graph mode. Below is an example " "of how to ensure tf.print executes in graph mode:\n") @tf_export(v1=["Print"]) +@dispatch.add_dispatch_support def Print(input_, data, message=None, first_n=None, summarize=None, name=None): """Prints a list of tensors. @@ -136,6 +138,7 @@ def _is_filepath(output_stream): # function definition. # pylint: disable=g-doc-args @tf_export("print") +@dispatch.add_dispatch_support def print_v2(*inputs, **kwargs): """Print the specified inputs. 
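Not every exported symbol is a plain `def` that can take a stacked decorator: symbols re-exported directly from generated modules (such as `gen_linalg_ops.log_matrix_determinant` or `gen_image_ops.decode_png` in the hunks above) are wrapped at export time instead. The sketch below, not part of the patch, shows that both spellings produce the same dispatch-enabled wrapper; `my_op` and the derived names are hypothetical.

from tensorflow.python.util import dispatch


def my_op(x, name=None):
  """Hypothetical op wrapper; stands in for a generated op like decode_png."""
  return x


# Style 1: stack the decorator on a Python `def`, as most hunks in this patch do.
@dispatch.add_dispatch_support
def my_decorated_op(x, name=None):
  return x


# Style 2: wrap the callable inline, as done for symbols exported straight from
# a generated module, e.g.
#   tf_export('io.decode_png', ...)(
#       dispatch.add_dispatch_support(gen_image_ops.decode_png))
my_wrapped_op = dispatch.add_dispatch_support(my_op)
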
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py index 556c646f2a7..6a7b4b68420 100644 --- a/tensorflow/python/ops/losses/losses_impl.py +++ b/tensorflow/python/ops/losses/losses_impl.py @@ -29,6 +29,7 @@ from tensorflow.python.ops import nn from tensorflow.python.ops import nn_ops from tensorflow.python.ops import weights_broadcast_ops from tensorflow.python.ops.losses import util +from tensorflow.python.util import dispatch from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.deprecation import deprecated_argument_lookup from tensorflow.python.util.tf_export import tf_export @@ -136,6 +137,7 @@ def _num_elements(losses): @tf_export(v1=["losses.compute_weighted_loss"]) +@dispatch.add_dispatch_support def compute_weighted_loss( losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -204,6 +206,7 @@ def compute_weighted_loss( @tf_export(v1=["losses.absolute_difference"]) +@dispatch.add_dispatch_support def absolute_difference( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -257,6 +260,7 @@ def absolute_difference( @tf_export(v1=["losses.cosine_distance"]) +@dispatch.add_dispatch_support @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def cosine_distance( labels, predictions, axis=None, weights=1.0, scope=None, @@ -313,6 +317,7 @@ def cosine_distance( @tf_export(v1=["losses.hinge_loss"]) +@dispatch.add_dispatch_support def hinge_loss(labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -363,6 +368,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None, @tf_export(v1=["losses.huber_loss"]) +@dispatch.add_dispatch_support def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -439,6 +445,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None, @tf_export(v1=["losses.log_loss"]) +@dispatch.add_dispatch_support def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): @@ -496,6 +503,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None, # TODO(b/37208492): Add reduction arg. 
@tf_export(v1=["losses.mean_pairwise_squared_error"]) +@dispatch.add_dispatch_support def mean_pairwise_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES): @@ -592,6 +600,7 @@ def mean_pairwise_squared_error( @tf_export(v1=["losses.mean_squared_error"]) +@dispatch.add_dispatch_support def mean_squared_error( labels, predictions, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -645,6 +654,7 @@ def mean_squared_error( @tf_export(v1=["losses.sigmoid_cross_entropy"]) +@dispatch.add_dispatch_support def sigmoid_cross_entropy( multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -709,6 +719,7 @@ def sigmoid_cross_entropy( @tf_export(v1=["losses.softmax_cross_entropy"]) +@dispatch.add_dispatch_support def softmax_cross_entropy( onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None, loss_collection=ops.GraphKeys.LOSSES, @@ -831,6 +842,7 @@ def _remove_squeezable_dimensions( @tf_export(v1=["losses.sparse_softmax_cross_entropy"]) +@dispatch.add_dispatch_support def sparse_softmax_cross_entropy( labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, diff --git a/tensorflow/python/ops/manip_ops.py b/tensorflow/python/ops/manip_ops.py index 56e8a894c24..fe99696f82f 100644 --- a/tensorflow/python/ops/manip_ops.py +++ b/tensorflow/python/ops/manip_ops.py @@ -20,11 +20,13 @@ from __future__ import print_function from tensorflow.python.ops import gen_manip_ops as _gen_manip_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # pylint: disable=protected-access @tf_export('roll', v1=['roll', 'manip.roll']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('manip.roll') def roll(input, shift, axis, name=None): # pylint: disable=redefined-builtin return _gen_manip_ops.roll(input, shift, axis, name) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 4c4982c6fd5..31994c16ddd 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -104,6 +104,7 @@ nextafter = gen_math_ops.next_after @tf_export("linspace", v1=["lin_space", "linspace"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("lin_space") def linspace_nd(start, stop, num, name=None, axis=0): r"""Generates evenly-spaced values in an interval along a given axis. @@ -214,8 +215,8 @@ linspace = linspace_nd arg_max = deprecation.deprecated(None, "Use `tf.math.argmax` instead")(arg_max) # pylint: disable=used-before-assignment arg_min = deprecation.deprecated(None, "Use `tf.math.argmin` instead")(arg_min) # pylint: disable=used-before-assignment -tf_export(v1=["arg_max"])(arg_max) -tf_export(v1=["arg_min"])(arg_min) +tf_export(v1=["arg_max"])(dispatch.add_dispatch_support(arg_max)) +tf_export(v1=["arg_min"])(dispatch.add_dispatch_support(arg_min)) # This is set by resource_variable_ops.py. 
It is included in this way since @@ -234,6 +235,7 @@ def _set_doc(doc): # pylint: disable=redefined-builtin @tf_export(v1=["math.argmax", "argmax"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dimension") @_set_doc( @@ -250,6 +252,7 @@ def argmax(input, @tf_export("math.argmax", "argmax", v1=[]) +@dispatch.add_dispatch_support def argmax_v2(input, axis=None, output_type=dtypes.int64, name=None): """Returns the index with the largest value across axes of a tensor. @@ -283,6 +286,7 @@ def argmax_v2(input, axis=None, output_type=dtypes.int64, name=None): @tf_export(v1=["math.argmin", "argmin"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dimension") @_set_doc( @@ -299,6 +303,7 @@ def argmin(input, @tf_export("math.argmin", "argmin", v1=[]) +@dispatch.add_dispatch_support def argmin_v2(input, axis=None, output_type=dtypes.int64, name=None): """Returns the index with the smallest value across axes of a tensor. @@ -549,6 +554,7 @@ def _neg(x, name=None): @tf_export(v1=["math.scalar_mul", "scalar_mul"]) +@dispatch.add_dispatch_support def scalar_mul(scalar, x, name=None): """Multiplies a scalar times a `Tensor` or `IndexedSlices` object. @@ -581,6 +587,7 @@ def scalar_mul(scalar, x, name=None): @tf_export("math.scalar_mul", "scalar_mul", v1=[]) +@dispatch.add_dispatch_support @_set_doc(scalar_mul.__doc__) def scalar_mul_v2(scalar, x, name=None): with ops.name_scope(name, "scalar_mul", [x]) as name: @@ -701,6 +708,7 @@ def sign(x, name=None): @tf_export("math.real", v1=["math.real", "real"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("real") @dispatch.add_dispatch_support def real(input, name=None): @@ -735,6 +743,7 @@ def real(input, name=None): @tf_export("math.imag", v1=["math.imag", "imag"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("imag") @dispatch.add_dispatch_support def imag(input, name=None): @@ -768,6 +777,7 @@ def imag(input, name=None): @tf_export("math.angle", v1=["math.angle", "angle"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("angle") @dispatch.add_dispatch_support def angle(input, name=None): @@ -937,6 +947,7 @@ def saturate_cast(value, dtype, name=None): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_float"]) +@dispatch.add_dispatch_support def to_float(x, name="ToFloat"): """Casts a tensor to type `float32`. @@ -956,6 +967,7 @@ def to_float(x, name="ToFloat"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_double"]) +@dispatch.add_dispatch_support def to_double(x, name="ToDouble"): """Casts a tensor to type `float64`. @@ -975,6 +987,7 @@ def to_double(x, name="ToDouble"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_int32"]) +@dispatch.add_dispatch_support def to_int32(x, name="ToInt32"): """Casts a tensor to type `int32`. @@ -994,6 +1007,7 @@ def to_int32(x, name="ToInt32"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_int64"]) +@dispatch.add_dispatch_support def to_int64(x, name="ToInt64"): """Casts a tensor to type `int64`. @@ -1013,6 +1027,7 @@ def to_int64(x, name="ToInt64"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_bfloat16"]) +@dispatch.add_dispatch_support def to_bfloat16(x, name="ToBFloat16"): """Casts a tensor to type `bfloat16`. 
@@ -1032,6 +1047,7 @@ def to_bfloat16(x, name="ToBFloat16"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_complex64"]) +@dispatch.add_dispatch_support def to_complex64(x, name="ToComplex64"): """Casts a tensor to type `complex64`. @@ -1051,6 +1067,7 @@ def to_complex64(x, name="ToComplex64"): @deprecation.deprecated(date=None, instructions="Use `tf.cast` instead.") @tf_export(v1=["to_complex128"]) +@dispatch.add_dispatch_support def to_complex128(x, name="ToComplex128"): """Casts a tensor to type `complex128`. @@ -1265,6 +1282,7 @@ def truediv(x, y, name=None): date=None, instructions="Deprecated in favor of operator or tf.math.divide.") @tf_export(v1=["div"]) +@dispatch.add_dispatch_support def div(x, y, name=None): """Divides x / y elementwise (using Python 2 division operator semantics). @@ -1288,6 +1306,7 @@ def div(x, y, name=None): @tf_export("math.divide_no_nan", v1=["math.divide_no_nan", "div_no_nan"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("div_no_nan") @dispatch.add_dispatch_support def div_no_nan(x, y, name=None): @@ -1620,6 +1639,7 @@ ops.Tensor._override_operator("__ne__", tensor_not_equals) @tf_export("range") +@dispatch.add_dispatch_support def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disable=redefined-builtin """Creates a sequence of numbers. @@ -1751,6 +1771,7 @@ def _may_reduce_to_scalar(keepdims, axis, output): @tf_export(v1=["math.reduce_sum", "reduce_sum"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -1885,6 +1906,7 @@ def reduce_sum_with_dims(input_tensor, @tf_export("math.reduce_euclidean_norm") +@dispatch.add_dispatch_support def reduce_euclidean_norm(input_tensor, axis=None, keepdims=False, name=None): """Computes the Euclidean norm of elements across dimensions of a tensor. @@ -1928,6 +1950,7 @@ def reduce_euclidean_norm(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.count_nonzero", "count_nonzero"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2005,6 +2028,7 @@ def count_nonzero(input_tensor=None, @tf_export("math.count_nonzero", v1=[]) +@dispatch.add_dispatch_support def count_nonzero_v2( input, # pylint: disable=redefined-builtin axis=None, @@ -2072,6 +2096,7 @@ def count_nonzero_v2( @tf_export(v1=["math.reduce_mean", "reduce_mean"]) +@dispatch.add_dispatch_support def reduce_mean_v1(input_tensor, axis=None, keepdims=None, @@ -2198,6 +2223,7 @@ def reduce_mean(input_tensor, axis=None, keepdims=False, name=None): @tf_export("math.reduce_variance") +@dispatch.add_dispatch_support def reduce_variance(input_tensor, axis=None, keepdims=False, name=None): """Computes the variance of elements across dimensions of a tensor. @@ -2246,6 +2272,7 @@ def reduce_variance(input_tensor, axis=None, keepdims=False, name=None): @tf_export("math.reduce_std") +@dispatch.add_dispatch_support def reduce_std(input_tensor, axis=None, keepdims=False, name=None): """Computes the standard deviation of elements across dimensions of a tensor. 
@@ -2328,6 +2355,7 @@ def reduce_prod(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_prod", "reduce_prod"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2373,6 +2401,7 @@ def reduce_prod_v1(input_tensor, @tf_export(v1=["math.reduce_min", "reduce_min"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2459,6 +2488,7 @@ def reduce_min(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_max", "reduce_max"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2563,6 +2593,7 @@ def reduce_max_with_dims(input_tensor, @tf_export(v1=["math.reduce_all", "reduce_all"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2662,6 +2693,7 @@ def reduce_all(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_any", "reduce_any"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2761,6 +2793,7 @@ def reduce_any(input_tensor, axis=None, keepdims=False, name=None): @tf_export(v1=["math.reduce_logsumexp", "reduce_logsumexp"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "keep_dims is deprecated, use keepdims instead", "keep_dims") @@ -2817,6 +2850,7 @@ def reduce_logsumexp_v1(input_tensor, @tf_export("math.reduce_logsumexp", "reduce_logsumexp", v1=[]) +@dispatch.add_dispatch_support def reduce_logsumexp(input_tensor, axis=None, keepdims=False, name=None): """Computes log(sum(exp(elements across dimensions of a tensor))). @@ -2877,6 +2911,7 @@ def reduce_logsumexp(input_tensor, axis=None, keepdims=False, name=None): @tf_export("linalg.trace", v1=["linalg.trace", "trace"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("trace") @dispatch.add_dispatch_support def trace(x, name=None): @@ -3116,6 +3151,7 @@ def matmul(a, @tf_export("linalg.matvec") +@dispatch.add_dispatch_support def matvec(a, b, transpose_a=False, @@ -3219,6 +3255,6 @@ _OverrideBinaryOperatorHelper(matmul, "matmul") sparse_matmul = deprecation.deprecated(None, "Use `tf.linalg.matmul` instead")( gen_math_ops.sparse_mat_mul) -tf_export(v1=["sparse_matmul"])(sparse_matmul) +tf_export(v1=["sparse_matmul"])(dispatch.add_dispatch_support(sparse_matmul)) @ops.RegisterStatistics("MatMul", "flops") @@ -3371,6 +3408,7 @@ def add_n(inputs, name=None): @tf_export("math.accumulate_n", v1=["math.accumulate_n", "accumulate_n"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("accumulate_n") def accumulate_n(inputs, shape=None, tensor_dtype=None, name=None): """Returns the element-wise sum of a list of tensors. @@ -3449,6 +3487,7 @@ def _accumulate_n_grad(op, grad): @tf_export("math.sigmoid", "nn.sigmoid", "sigmoid") +@dispatch.add_dispatch_support def sigmoid(x, name=None): r"""Computes sigmoid of `x` element-wise.
@@ -3521,6 +3560,7 @@ def log_sigmoid(x, name=None): @tf_export("math.bincount", v1=[]) +@dispatch.add_dispatch_support def bincount(arr, weights=None, minlength=None, @@ -3596,6 +3636,7 @@ def bincount(arr, @tf_export(v1=["math.bincount", "bincount"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("bincount") def bincount_v1(arr, weights=None, @@ -3629,6 +3670,7 @@ def bincount_v1(arr, @tf_export("math.cumsum", "cumsum") +@dispatch.add_dispatch_support def cumsum(x, axis=0, exclusive=False, reverse=False, name=None): """Compute the cumulative sum of the tensor `x` along `axis`. @@ -3700,6 +3742,7 @@ def cumsum(x, axis=0, exclusive=False, reverse=False, name=None): @tf_export("math.cumprod", v1=["math.cumprod", "cumprod"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("cumprod") def cumprod(x, axis=0, exclusive=False, reverse=False, name=None): """Compute the cumulative product of the tensor `x` along `axis`. @@ -3753,6 +3796,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None): @tf_export("math.cumulative_logsumexp", v1=["math.cumulative_logsumexp"]) +@dispatch.add_dispatch_support def cumulative_logsumexp(x, axis=0, exclusive=False, reverse=False, name=None): """Compute the cumulative log-sum-exp of the tensor `x` along `axis`. @@ -3912,6 +3956,7 @@ def _unsorted_segment_N(data, segment_ids, num_segments): @tf_export( "math.unsorted_segment_mean", v1=["math.unsorted_segment_mean", "unsorted_segment_mean"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("unsorted_segment_mean") @dispatch.add_dispatch_support def unsorted_segment_mean(data, segment_ids, num_segments, name=None): @@ -3958,6 +4003,7 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None): @tf_export( "math.unsorted_segment_sqrt_n", v1=["math.unsorted_segment_sqrt_n", "unsorted_segment_sqrt_n"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("unsorted_segment_sqrt_n") @dispatch.add_dispatch_support def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None): @@ -4307,6 +4353,7 @@ def sparse_segment_sqrt_n_v2(data, @tf_export("tensordot", "linalg.tensordot") +@dispatch.add_dispatch_support def tensordot(a, b, axes, name=None): r"""Tensor contraction of a and b along specified axes and outer product. @@ -4493,6 +4540,7 @@ def tensordot(a, b, axes, name=None): @tf_export("math.polyval") +@dispatch.add_dispatch_support def polyval(coeffs, x, name=None): r"""Computes the elementwise value of a polynomial. @@ -4563,6 +4611,7 @@ def polyval(coeffs, x, name=None): @tf_export("math.reciprocal_no_nan") +@dispatch.add_dispatch_support def reciprocal_no_nan(x, name=None): """Performs a safe reciprocal operation, element wise. @@ -4665,6 +4714,7 @@ def ndtri(x, name=None): @tf_export("math.ceil", v1=["math.ceil", "ceil"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("ceil") @dispatch.add_dispatch_support def ceil(x, name=None): @@ -4778,6 +4828,7 @@ def exp(x, name=None): @tf_export("math.sobol_sample") +@dispatch.add_dispatch_support def sobol_sample(dim, num_results, skip=0, dtype=dtypes.float32, name=None): """Generates points from the Sobol sequence. 
@@ -4802,6 +4853,7 @@ def sobol_sample(dim, num_results, skip=0, dtype=dtypes.float32, name=None): @tf_export("math.rsqrt", v1=["math.rsqrt", "rsqrt"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("rsqrt") @dispatch.add_dispatch_support def rsqrt(x, name=None): diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 03c1289246e..4bda85077bc 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -39,12 +39,14 @@ from tensorflow.python.ops import nn_ops from tensorflow.python.ops import variables from tensorflow.python.ops.losses import util as losses_util from tensorflow.python.platform import device_context +from tensorflow.python.util import dispatch from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.deprecation import deprecated_argument_lookup from tensorflow.python.util.tf_export import tf_export @tf_export("nn.log_poisson_loss") +@dispatch.add_dispatch_support def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None): """Computes log Poisson loss given `log_input`. @@ -110,6 +112,7 @@ def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None): @tf_export(v1=["nn.sigmoid_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support def sigmoid_cross_entropy_with_logits( # pylint: disable=invalid-name _sentinel=None, labels=None, @@ -192,6 +195,7 @@ def sigmoid_cross_entropy_with_logits( # pylint: disable=invalid-name # Note: intentionally calling this v2 to not allow existing code with indirect # imports to ignore the sentinel behavior. @tf_export("nn.sigmoid_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def sigmoid_cross_entropy_with_logits_v2( # pylint: disable=invalid-name labels=None, logits=None, @@ -242,6 +246,7 @@ def sigmoid_cross_entropy_with_logits_v2( # pylint: disable=invalid-name @tf_export("nn.weighted_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight, name=None): """Computes a weighted cross entropy. @@ -320,6 +325,7 @@ def weighted_cross_entropy_with_logits_v2(labels, logits, pos_weight, @tf_export(v1=["nn.weighted_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support @deprecated_args(None, "targets is deprecated, use labels instead", "targets") def weighted_cross_entropy_with_logits(labels=None, logits=None, @@ -384,6 +390,7 @@ def weighted_cross_entropy_with_logits(labels=None, @tf_export("nn.compute_average_loss") +@dispatch.add_dispatch_support def compute_average_loss(per_example_loss, sample_weight=None, global_batch_size=None): @@ -440,6 +447,7 @@ def compute_average_loss(per_example_loss, @tf_export("nn.scale_regularization_loss") +@dispatch.add_dispatch_support def scale_regularization_loss(regularization_loss): """Scales the sum of the given regularization losses by number of replicas. @@ -478,6 +486,7 @@ def scale_regularization_loss(regularization_loss): @tf_export(v1=["nn.relu_layer"]) +@dispatch.add_dispatch_support def relu_layer(x, weights, biases, name=None): """Computes Relu(x * weight + biases). 
@@ -501,6 +510,7 @@ def relu_layer(x, weights, biases, name=None): @tf_export("nn.swish") +@dispatch.add_dispatch_support @custom_gradient.custom_gradient def swish(features): # pylint: disable=g-doc-args @@ -538,6 +548,7 @@ def swish(features): # pylint: disable=redefined-builtin @tf_export("linalg.normalize") +@dispatch.add_dispatch_support def normalize(tensor, ord="euclidean", axis=None, name=None): """Normalizes `tensor` along dimension `axis` using specified norm. @@ -590,6 +601,7 @@ def normalize(tensor, ord="euclidean", axis=None, name=None): @tf_export(v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"]) +@dispatch.add_dispatch_support @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None): """Normalizes along dimension `axis` using an L2 norm. @@ -618,6 +630,7 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None): @tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize", v1=[]) +@dispatch.add_dispatch_support def l2_normalize_v2(x, axis=None, epsilon=1e-12, name=None): """Normalizes along dimension `axis` using an L2 norm. @@ -668,6 +681,7 @@ def _count_nonzero(input_tensor, dtype=dtypes.int64): @tf_export("math.zero_fraction", "nn.zero_fraction") +@dispatch.add_dispatch_support def zero_fraction(value, name=None): """Returns the fraction of zeros in `value`. @@ -710,6 +724,7 @@ def zero_fraction(value, name=None): # pylint: disable=redefined-builtin @tf_export(v1=["nn.depthwise_conv2d"]) +@dispatch.add_dispatch_support def depthwise_conv2d(input, filter, strides, @@ -838,6 +853,7 @@ def depthwise_conv2d(input, @tf_export("nn.depthwise_conv2d", v1=[]) +@dispatch.add_dispatch_support def depthwise_conv2d_v2(input, filter, strides, @@ -935,6 +951,7 @@ def depthwise_conv2d_v2(input, # pylint: disable=redefined-builtin,line-too-long @tf_export(v1=["nn.separable_conv2d"]) +@dispatch.add_dispatch_support def separable_conv2d(input, depthwise_filter, pointwise_filter, @@ -1042,6 +1059,7 @@ def separable_conv2d(input, @tf_export("nn.separable_conv2d", v1=[]) +@dispatch.add_dispatch_support def separable_conv2d_v2( input, depthwise_filter, @@ -1117,6 +1135,7 @@ def separable_conv2d_v2( @tf_export(v1=["nn.sufficient_statistics"]) +@dispatch.add_dispatch_support def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None, keepdims=None): """Calculate the sufficient statistics for the mean and variance of `x`. @@ -1174,6 +1193,7 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=None, name=None, @tf_export("nn.sufficient_statistics", v1=[]) +@dispatch.add_dispatch_support def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None): """Calculate the sufficient statistics for the mean and variance of `x`. @@ -1203,6 +1223,7 @@ def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None): @tf_export("nn.normalize_moments") +@dispatch.add_dispatch_support def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): """Calculate the mean and variance of based on the sufficient statistics. 
@@ -1235,6 +1256,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None): @tf_export(v1=["nn.moments"]) +@dispatch.add_dispatch_support def moments( x, axes, @@ -1300,6 +1322,7 @@ def moments( @tf_export("nn.moments", v1=[]) +@dispatch.add_dispatch_support def moments_v2( x, axes, @@ -1336,6 +1359,7 @@ def moments_v2( @tf_export(v1=["nn.weighted_moments"]) +@dispatch.add_dispatch_support def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None, keepdims=None): """Returns the frequency-weighted mean and variance of `x`. @@ -1414,6 +1438,7 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=None, @tf_export("nn.weighted_moments", v1=[]) +@dispatch.add_dispatch_support def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None): """Returns the frequency-weighted mean and variance of `x`. @@ -1438,6 +1463,7 @@ def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None): @tf_export("nn.batch_normalization") +@dispatch.add_dispatch_support def batch_normalization(x, mean, variance, @@ -1508,6 +1534,7 @@ def batch_normalization(x, @tf_export(v1=["nn.fused_batch_norm"]) +@dispatch.add_dispatch_support def fused_batch_norm( x, scale, @@ -1631,6 +1658,7 @@ def fused_batch_norm( @tf_export(v1=["nn.batch_norm_with_global_normalization"]) +@dispatch.add_dispatch_support def batch_norm_with_global_normalization(t=None, m=None, v=None, @@ -1685,6 +1713,7 @@ def batch_norm_with_global_normalization(t=None, # pylint: disable=redefined-builtin,line-too-long @tf_export("nn.batch_norm_with_global_normalization", v1=[]) +@dispatch.add_dispatch_support def batch_norm_with_global_normalization_v2(input, mean, variance, @@ -1934,6 +1963,7 @@ def _compute_sampled_logits(weights, @tf_export("nn.nce_loss", v1=[]) +@dispatch.add_dispatch_support def nce_loss_v2(weights, biases, labels, @@ -2038,6 +2068,7 @@ def nce_loss_v2(weights, @tf_export(v1=["nn.nce_loss"]) +@dispatch.add_dispatch_support def nce_loss(weights, biases, labels, @@ -2149,6 +2180,7 @@ def nce_loss(weights, @tf_export("nn.sampled_softmax_loss", v1=[]) +@dispatch.add_dispatch_support def sampled_softmax_loss_v2(weights, biases, labels, @@ -2240,6 +2272,7 @@ def sampled_softmax_loss_v2(weights, @tf_export(v1=["nn.sampled_softmax_loss"]) +@dispatch.add_dispatch_support def sampled_softmax_loss(weights, biases, labels, diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 248c57c1ba5..e7955100b24 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -239,6 +239,7 @@ class _NonAtrousConvolution(object): @tf_export("nn.dilation2d", v1=[]) +@dispatch.add_dispatch_support def dilation2d_v2( input, # pylint: disable=redefined-builtin filters, # pylint: disable=redefined-builtin @@ -306,6 +307,7 @@ def dilation2d_v2( @tf_export(v1=["nn.dilation2d"]) +@dispatch.add_dispatch_support def dilation2d_v1( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin filter=None, # pylint: disable=redefined-builtin @@ -324,6 +326,7 @@ dilation2d_v1.__doc__ = gen_nn_ops.dilation2d.__doc__ @tf_export("nn.with_space_to_batch") +@dispatch.add_dispatch_support def with_space_to_batch( input, # pylint: disable=redefined-builtin dilation_rate, @@ -772,6 +775,7 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate): @tf_export(v1=["nn.convolution"]) +@dispatch.add_dispatch_support def convolution( input, # pylint: disable=redefined-builtin filter, # pylint: 
disable=redefined-builtin @@ -907,7 +911,8 @@ def convolution( @tf_export("nn.convolution", v1=[]) -def convolution_v2( +@dispatch.add_dispatch_support +def convolution_v2( # pylint: disable=missing-docstring input, # pylint: disable=redefined-builtin filters, strides=None, @@ -1116,6 +1121,7 @@ class Convolution(object): @tf_export(v1=["nn.pool"]) +@dispatch.add_dispatch_support def pool( input, # pylint: disable=redefined-builtin window_shape, @@ -1290,6 +1296,7 @@ def pool( @tf_export("nn.pool", v1=[]) +@dispatch.add_dispatch_support def pool_v2( input, # pylint: disable=redefined-builtin window_shape, @@ -1389,6 +1396,7 @@ def pool_v2( @tf_export("nn.atrous_conv2d") +@dispatch.add_dispatch_support def atrous_conv2d(value, filters, rate, padding, name=None): """Atrous convolution (a.k.a. convolution with holes or dilated convolution). @@ -1576,6 +1584,7 @@ def convert_padding(padding): @tf_export(v1=["nn.conv1d"]) +@dispatch.add_dispatch_support @deprecation.deprecated_arg_values( None, "`NCHW` for data_format is deprecated, use `NCW` instead", @@ -1674,6 +1683,7 @@ def conv1d( @tf_export("nn.conv1d", v1=[]) +@dispatch.add_dispatch_support def conv1d_v2( input, # pylint: disable=redefined-builtin filters, @@ -1739,6 +1749,7 @@ def conv1d_v2( @tf_export("nn.conv1d_transpose") +@dispatch.add_dispatch_support def conv1d_transpose( input, # pylint: disable=redefined-builtin filters, @@ -1827,6 +1838,7 @@ def conv1d_transpose( @tf_export("nn.conv2d", v1=[]) +@dispatch.add_dispatch_support def conv2d_v2(input, # pylint: disable=redefined-builtin filters, strides, @@ -1927,6 +1939,7 @@ def conv2d_v2(input, # pylint: disable=redefined-builtin @tf_export(v1=["nn.conv2d"]) +@dispatch.add_dispatch_support def conv2d( # pylint: disable=redefined-builtin,dangerous-default-value input, filter=None, @@ -2024,6 +2037,7 @@ def conv2d( # pylint: disable=redefined-builtin,dangerous-default-value @tf_export(v1=["nn.conv2d_backprop_filter"]) +@dispatch.add_dispatch_support def conv2d_backprop_filter( # pylint: disable=redefined-builtin,dangerous-default-value input, filter_sizes, @@ -2084,6 +2098,7 @@ def conv2d_backprop_filter( # pylint: disable=redefined-builtin,dangerous-defau @tf_export(v1=["nn.conv2d_backprop_input"]) +@dispatch.add_dispatch_support def conv2d_backprop_input( # pylint: disable=redefined-builtin,dangerous-default-value input_sizes, filter=None, @@ -2148,6 +2163,7 @@ def conv2d_backprop_input( # pylint: disable=redefined-builtin,dangerous-defaul @tf_export(v1=["nn.conv2d_transpose"]) +@dispatch.add_dispatch_support def conv2d_transpose( value=None, filter=None, # pylint: disable=redefined-builtin @@ -2224,6 +2240,7 @@ def conv2d_transpose( @tf_export("nn.conv2d_transpose", v1=[]) +@dispatch.add_dispatch_support def conv2d_transpose_v2( input, # pylint: disable=redefined-builtin filters, # pylint: disable=redefined-builtin @@ -2301,6 +2318,7 @@ def conv2d_transpose_v2( @tf_export("nn.atrous_conv2d_transpose") +@dispatch.add_dispatch_support def atrous_conv2d_transpose(value, filters, output_shape, @@ -2459,6 +2477,7 @@ def atrous_conv2d_transpose(value, @tf_export(v1=["nn.depthwise_conv2d_native"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("nn.depthwise_conv2d_native") def depthwise_conv2d_native( # pylint: disable=redefined-builtin,dangerous-default-value input, @@ -2538,6 +2557,7 @@ def depthwise_conv2d_native( # pylint: disable=redefined-builtin,dangerous-defa "nn.depthwise_conv2d_native_backprop_input", "nn.depthwise_conv2d_backprop_input" ]) 
+@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("nn.depthwise_conv2d_native_backprop_input") def depthwise_conv2d_native_backprop_input( # pylint: disable=redefined-builtin,dangerous-default-value input_sizes, @@ -2607,6 +2627,7 @@ def depthwise_conv2d_native_backprop_input( # pylint: disable=redefined-builtin "nn.depthwise_conv2d_native_backprop_filter", "nn.depthwise_conv2d_backprop_filter" ]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("nn.depthwise_conv2d_native_backprop_filter") def depthwise_conv2d_native_backprop_filter( # pylint: disable=redefined-builtin,dangerous-default-value input, @@ -2672,6 +2693,7 @@ def depthwise_conv2d_native_backprop_filter( # pylint: disable=redefined-builti @tf_export("nn.conv3d", v1=[]) +@dispatch.add_dispatch_support def conv3d_v2(input, # pylint: disable=redefined-builtin,missing-docstring filters, strides, @@ -2691,6 +2713,7 @@ def conv3d_v2(input, # pylint: disable=redefined-builtin,missing-docstring @tf_export(v1=["nn.conv3d"]) +@dispatch.add_dispatch_support def conv3d_v1( # pylint: disable=missing-docstring,dangerous-default-value input, # pylint: disable=redefined-builtin filter=None, # pylint: disable=redefined-builtin @@ -2711,6 +2734,7 @@ conv3d_v1.__doc__ = gen_nn_ops.conv3d.__doc__ @tf_export(v1=["nn.conv3d_transpose"]) +@dispatch.add_dispatch_support def conv3d_transpose( value, filter=None, # pylint: disable=redefined-builtin @@ -2782,6 +2806,7 @@ def conv3d_transpose( @tf_export("nn.conv3d_transpose", v1=[]) +@dispatch.add_dispatch_support def conv3d_transpose_v2(input, # pylint: disable=redefined-builtin filters, output_shape, @@ -2861,6 +2886,7 @@ CONV_TRANSPOSE_OPS = ( @tf_export("nn.conv_transpose") +@dispatch.add_dispatch_support def conv_transpose(input, # pylint: disable=redefined-builtin filters, output_shape, @@ -2958,6 +2984,7 @@ _tf_deterministic_ops.value = None @tf_export("nn.bias_add") +@dispatch.add_dispatch_support def bias_add(value, bias, data_format=None, name=None): """Adds `bias` to `value`. @@ -3047,6 +3074,7 @@ def bias_add_v1(value, bias, name=None): @tf_export(v1=["nn.crelu"]) +@dispatch.add_dispatch_support def crelu(features, name=None, axis=-1): """Computes Concatenated ReLU. @@ -3079,12 +3107,14 @@ def crelu(features, name=None, axis=-1): @tf_export("nn.crelu", v1=[]) +@dispatch.add_dispatch_support def crelu_v2(features, axis=-1, name=None): return crelu(features, name=name, axis=axis) crelu_v2.__doc__ = crelu.__doc__ @tf_export("nn.relu6") +@dispatch.add_dispatch_support def relu6(features, name=None): """Computes Rectified Linear 6: `min(max(features, 0), 6)`. @@ -3107,6 +3137,7 @@ def relu6(features, name=None): @tf_export("nn.leaky_relu") +@dispatch.add_dispatch_support def leaky_relu(features, alpha=0.2, name=None): """Compute the Leaky ReLU activation function. @@ -3245,6 +3276,7 @@ def _softmax(logits, compute_op, dim=-1, name=None): @tf_export(v1=["nn.softmax", "math.softmax"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim") def softmax(logits, axis=None, name=None, dim=None): """Computes softmax activations. @@ -3289,6 +3321,7 @@ def softmax(logits, axis=None, name=None, dim=None): @tf_export("nn.softmax", "math.softmax", v1=[]) +@dispatch.add_dispatch_support def softmax_v2(logits, axis=None, name=None): """Computes softmax activations. 
@@ -3316,6 +3349,7 @@ def softmax_v2(logits, axis=None, name=None): @tf_export(v1=["nn.log_softmax", "math.log_softmax"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim") def log_softmax(logits, axis=None, name=None, dim=None): """Computes log softmax activations. @@ -3346,6 +3380,7 @@ def log_softmax(logits, axis=None, name=None, dim=None): @tf_export("nn.log_softmax", "math.log_softmax", v1=[]) +@dispatch.add_dispatch_support def log_softmax_v2(logits, axis=None, name=None): """Computes log softmax activations. @@ -3382,6 +3417,7 @@ def _ensure_xent_args(name, sentinel, labels, logits): @tf_export("nn.softmax_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def softmax_cross_entropy_with_logits_v2(labels, logits, axis=-1, name=None): """Computes softmax cross entropy between `logits` and `labels`. @@ -3444,6 +3480,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, axis=-1, name=None): @tf_export(v1=["nn.softmax_cross_entropy_with_logits_v2"]) +@dispatch.add_dispatch_support @deprecated_args(None, "dim is deprecated, use axis instead", "dim") def softmax_cross_entropy_with_logits_v2_helper( labels, logits, axis=None, name=None, dim=None): @@ -3571,6 +3608,7 @@ See `tf.nn.softmax_cross_entropy_with_logits_v2`. @tf_export(v1=["nn.softmax_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support @deprecation.deprecated(date=None, instructions=_XENT_DEPRECATION) def softmax_cross_entropy_with_logits( _sentinel=None, # pylint: disable=invalid-name @@ -3639,6 +3677,7 @@ def softmax_cross_entropy_with_logits( @tf_export(v1=["nn.sparse_softmax_cross_entropy_with_logits"]) +@dispatch.add_dispatch_support def sparse_softmax_cross_entropy_with_logits( _sentinel=None, # pylint: disable=invalid-name labels=None, @@ -3764,6 +3803,7 @@ def sparse_softmax_cross_entropy_with_logits( @tf_export("nn.sparse_softmax_cross_entropy_with_logits", v1=[]) +@dispatch.add_dispatch_support def sparse_softmax_cross_entropy_with_logits_v2(labels, logits, name=None): """Computes sparse softmax cross entropy between `logits` and `labels`. @@ -3816,6 +3856,7 @@ def sparse_softmax_cross_entropy_with_logits_v2(labels, logits, name=None): @tf_export("nn.avg_pool", v1=["nn.avg_pool_v2"]) +@dispatch.add_dispatch_support def avg_pool_v2(input, ksize, strides, padding, data_format=None, name=None): # pylint: disable=redefined-builtin """Performs the avg pooling on the input. @@ -3878,6 +3919,7 @@ def avg_pool_v2(input, ksize, strides, padding, data_format=None, name=None): # @tf_export(v1=["nn.avg_pool", "nn.avg_pool2d"]) +@dispatch.add_dispatch_support def avg_pool(value, ksize, strides, padding, data_format="NHWC", name=None, input=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. @@ -3922,6 +3964,7 @@ def avg_pool(value, ksize, strides, padding, data_format="NHWC", @tf_export("nn.avg_pool2d", v1=[]) +@dispatch.add_dispatch_support def avg_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. @@ -3961,6 +4004,7 @@ def avg_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): @tf_export("nn.avg_pool1d") +@dispatch.add_dispatch_support def avg_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. 
@@ -4006,6 +4050,7 @@ def avg_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): # @tf_export("nn.avg_pool3d") +@dispatch.add_dispatch_support def avg_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): # pylint: disable=redefined-builtin """Performs the average pooling on the input. @@ -4046,6 +4091,7 @@ def avg_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): # pylint: disable=redefined-builtin @tf_export("nn.max_pool", v1=["nn.max_pool_v2"]) +@dispatch.add_dispatch_support def max_pool_v2(input, ksize, strides, padding, data_format=None, name=None): """Performs the max pooling on the input. @@ -4106,6 +4152,7 @@ def max_pool_v2(input, ksize, strides, padding, data_format=None, name=None): @tf_export(v1=["nn.max_pool"]) +@dispatch.add_dispatch_support def max_pool(value, ksize, strides, @@ -4155,6 +4202,7 @@ def max_pool(value, # pylint: disable=redefined-builtin @tf_export("nn.max_pool1d") +@dispatch.add_dispatch_support def max_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): """Performs the max pooling on the input. @@ -4199,6 +4247,7 @@ def max_pool1d(input, ksize, strides, padding, data_format="NWC", name=None): # pylint: disable=redefined-builtin @tf_export("nn.max_pool2d") +@dispatch.add_dispatch_support def max_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): """Performs the max pooling on the input. @@ -4237,6 +4286,7 @@ def max_pool2d(input, ksize, strides, padding, data_format="NHWC", name=None): # pylint: disable=redefined-builtin @tf_export("nn.max_pool3d") +@dispatch.add_dispatch_support def max_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): """Performs the max pooling on the input. @@ -4279,6 +4329,7 @@ def max_pool3d(input, ksize, strides, padding, data_format="NDHWC", name=None): @tf_export("nn.max_pool_with_argmax", v1=[]) +@dispatch.add_dispatch_support def max_pool_with_argmax_v2( input, # pylint: disable=redefined-builtin ksize, @@ -4348,6 +4399,7 @@ def max_pool_with_argmax_v2( @tf_export(v1=["nn.max_pool_with_argmax"]) +@dispatch.add_dispatch_support def max_pool_with_argmax_v1( # pylint: disable=missing-docstring,invalid-name input, # pylint: disable=redefined-builtin ksize, @@ -4442,6 +4494,7 @@ def _calc_bias_add_flops(graph, node): @tf_export(v1=["nn.xw_plus_b"]) +@dispatch.add_dispatch_support def xw_plus_b(x, weights, biases, name=None): # pylint: disable=invalid-name """Computes matmul(x, weights) + biases. @@ -4691,6 +4744,7 @@ def dropout_v2(x, rate, noise_shape=None, seed=None, name=None): @tf_export("math.top_k", "nn.top_k") +@dispatch.add_dispatch_support def top_k(input, k=1, sorted=True, name=None): # pylint: disable=redefined-builtin """Finds values and indices of the `k` largest entries for the last dimension. @@ -4751,6 +4805,7 @@ def nth_element(input, n, reverse=False, name=None): # pylint: disable=redefine @tf_export(v1=["nn.fractional_max_pool"]) +@dispatch.add_dispatch_support @deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` " "args are deprecated. 
Use fractional_max_pool_v2.") def fractional_max_pool(value, @@ -4837,6 +4892,7 @@ def fractional_max_pool(value, @tf_export("nn.fractional_max_pool", v1=[]) +@dispatch.add_dispatch_support def fractional_max_pool_v2(value, pooling_ratio, pseudo_random=False, @@ -4922,6 +4978,7 @@ def fractional_max_pool_v2(value, @tf_export(v1=["nn.fractional_avg_pool"]) +@dispatch.add_dispatch_support @deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` " "args are deprecated. Use fractional_avg_pool_v2.") def fractional_avg_pool(value, @@ -4987,6 +5044,7 @@ def fractional_avg_pool(value, @tf_export("nn.fractional_avg_pool", v1=[]) +@dispatch.add_dispatch_support def fractional_avg_pool_v2(value, pooling_ratio, pseudo_random=False, @@ -5065,6 +5123,7 @@ def _calc_dilation2d_flops(graph, node): @tf_export(v1=["nn.erosion2d"]) +@dispatch.add_dispatch_support def erosion2d(value, kernel, strides, rates, padding, name=None): """Computes the grayscale erosion of 4-D `value` and 3-D `kernel` tensors. @@ -5124,6 +5183,7 @@ def erosion2d(value, kernel, strides, rates, padding, name=None): @tf_export("nn.erosion2d", v1=[]) +@dispatch.add_dispatch_support def erosion2d_v2(value, filters, strides, @@ -5193,6 +5253,7 @@ def erosion2d_v2(value, @tf_export(v1=["math.in_top_k", "nn.in_top_k"]) +@dispatch.add_dispatch_support def in_top_k(predictions, targets, k, name=None): r"""Says whether the targets are in the top `K` predictions. @@ -5227,6 +5288,7 @@ def in_top_k(predictions, targets, k, name=None): @tf_export("math.in_top_k", "nn.in_top_k", v1=[]) +@dispatch.add_dispatch_support def in_top_k_v2(targets, predictions, k, name=None): return in_top_k(predictions, targets, k, name) @@ -5234,7 +5296,11 @@ def in_top_k_v2(targets, predictions, k, name=None): in_top_k_v2.__doc__ = in_top_k.__doc__ -tf_export(v1=["nn.quantized_avg_pool"])(gen_nn_ops.quantized_avg_pool) -tf_export(v1=["nn.quantized_conv2d"])(gen_nn_ops.quantized_conv2d) -tf_export(v1=["nn.quantized_relu_x"])(gen_nn_ops.quantized_relu_x) -tf_export(v1=["nn.quantized_max_pool"])(gen_nn_ops.quantized_max_pool) +tf_export(v1=["nn.quantized_avg_pool"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_avg_pool)) +tf_export(v1=["nn.quantized_conv2d"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_conv2d)) +tf_export(v1=["nn.quantized_relu_x"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_relu_x)) +tf_export(v1=["nn.quantized_max_pool"])( + dispatch.add_dispatch_support(gen_nn_ops.quantized_max_pool)) diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py index 9f9e7229442..81a532bb150 100644 --- a/tensorflow/python/ops/numerics.py +++ b/tensorflow/python/ops/numerics.py @@ -25,10 +25,12 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export(v1=["debugging.assert_all_finite", "verify_tensor_all_finite"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("verify_tensor_all_finite") def verify_tensor_all_finite(t=None, msg=None, name=None, x=None, message=None): """Assert that the tensor does not contain any NaN's or Inf's. 
@@ -50,6 +52,7 @@ def verify_tensor_all_finite(t=None, msg=None, name=None, x=None, message=None): @tf_export("debugging.assert_all_finite", v1=[]) +@dispatch.add_dispatch_support def verify_tensor_all_finite_v2(x, message, name=None): """Assert that the tensor does not contain any NaN's or Inf's. diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py index 8e518e913be..edcae89aada 100644 --- a/tensorflow/python/ops/parsing_ops.py +++ b/tensorflow/python/ops/parsing_ops.py @@ -30,6 +30,7 @@ from tensorflow.python.ops import parsing_config from tensorflow.python.ops.gen_parsing_ops import * # pylint: enable=wildcard-import,undefined-variable from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -77,6 +78,7 @@ def _prepend_none_dimension(features): @tf_export("io.parse_example", v1=[]) +@dispatch.add_dispatch_support def parse_example_v2(serialized, features, example_names=None, name=None): # pylint: disable=line-too-long """Parses `Example` protos into a `dict` of tensors. @@ -314,6 +316,7 @@ def parse_example_v2(serialized, features, example_names=None, name=None): @tf_export(v1=["io.parse_example", "parse_example"]) +@dispatch.add_dispatch_support def parse_example(serialized, features, name=None, example_names=None): return parse_example_v2(serialized, features, example_names, name) @@ -373,6 +376,7 @@ def _parse_example_raw(serialized, names, params, name): @tf_export(v1=["io.parse_single_example", "parse_single_example"]) +@dispatch.add_dispatch_support def parse_single_example(serialized, features, name=None, example_names=None): """Parses a single `Example` proto. @@ -407,6 +411,7 @@ def parse_single_example(serialized, features, name=None, example_names=None): @tf_export("io.parse_single_example", v1=[]) +@dispatch.add_dispatch_support def parse_single_example_v2( serialized, features, example_names=None, name=None ): @@ -448,6 +453,7 @@ def parse_single_example_v2( @tf_export("io.parse_sequence_example") +@dispatch.add_dispatch_support def parse_sequence_example(serialized, context_features=None, sequence_features=None, @@ -692,6 +698,7 @@ def _parse_sequence_example_raw(serialized, @tf_export("io.parse_single_sequence_example", v1=["io.parse_single_sequence_example", "parse_single_sequence_example"]) +@dispatch.add_dispatch_support def parse_single_sequence_example( serialized, context_features=None, sequence_features=None, example_name=None, name=None): @@ -835,6 +842,7 @@ def _parse_single_sequence_example_raw(serialized, @tf_export("io.decode_raw", v1=[]) +@dispatch.add_dispatch_support def decode_raw(input_bytes, out_type, little_endian=True, @@ -877,6 +885,7 @@ def decode_raw(input_bytes, @tf_export(v1=["decode_raw", "io.decode_raw"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "bytes is deprecated, use input_bytes instead", "bytes") @@ -921,6 +930,7 @@ def decode_raw_v1( # Swap `name` and `na_value` for backward compatibility. 
@tf_export(v1=["io.decode_csv", "decode_csv"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("decode_csv") def decode_csv(records, record_defaults, @@ -970,6 +980,7 @@ def decode_csv(records, @tf_export("io.decode_csv", v1=[]) +@dispatch.add_dispatch_support def decode_csv_v2(records, record_defaults, field_delim=",", diff --git a/tensorflow/python/ops/proto_ops.py b/tensorflow/python/ops/proto_ops.py index 1f7300dbef9..0e19aad584c 100644 --- a/tensorflow/python/ops/proto_ops.py +++ b/tensorflow/python/ops/proto_ops.py @@ -22,10 +22,11 @@ from __future__ import print_function from tensorflow.python.framework import ops from tensorflow.python.ops.gen_decode_proto_ops import decode_proto_v2 as decode_proto from tensorflow.python.ops.gen_encode_proto_ops import encode_proto +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export -tf_export("io.decode_proto")(decode_proto) -tf_export("io.encode_proto")(encode_proto) +tf_export("io.decode_proto")(dispatch.add_dispatch_support(decode_proto)) +tf_export("io.encode_proto")(dispatch.add_dispatch_support(encode_proto)) ops.NotDifferentiable("DecodeProtoV2") ops.NotDifferentiable("EncodeProto") diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py index 7f971cd558f..782902f2f71 100644 --- a/tensorflow/python/ops/ragged/ragged_array_ops.py +++ b/tensorflow/python/ops/ragged/ragged_array_ops.py @@ -32,6 +32,7 @@ from tensorflow.python.ops.ragged import ragged_math_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_util from tensorflow.python.ops.ragged import segment_id_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export #=============================================================================== @@ -40,6 +41,7 @@ from tensorflow.python.util.tf_export import tf_export @tf_export('ragged.boolean_mask') +@dispatch.add_dispatch_support def boolean_mask(data, mask, name=None): """Applies a boolean mask to `data` without flattening the mask dimensions. @@ -538,6 +540,7 @@ def ragged_one_hot(indices, # ragged.stack_dynamic_partitions #=============================================================================== @tf_export('ragged.stack_dynamic_partitions') +@dispatch.add_dispatch_support def stack_dynamic_partitions(data, partitions, num_partitions, name=None): """Stacks dynamic partitions of a Tensor or RaggedTensor. @@ -699,6 +702,7 @@ def reverse(tensor, axis, name=None): @tf_export('ragged.cross') +@dispatch.add_dispatch_support def cross(inputs, name=None): """Generates feature cross from a list of tensors. @@ -725,6 +729,7 @@ def cross(inputs, name=None): @tf_export('ragged.cross_hashed') +@dispatch.add_dispatch_support def cross_hashed(inputs, num_buckets=0, hash_key=None, name=None): """Generates hashed feature cross from a list of tensors. 
diff --git a/tensorflow/python/ops/ragged/ragged_concat_ops.py b/tensorflow/python/ops/ragged/ragged_concat_ops.py index 9bcb1aa4765..cd710f449a6 100644 --- a/tensorflow/python/ops/ragged/ragged_concat_ops.py +++ b/tensorflow/python/ops/ragged/ragged_concat_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.ops.ragged import ragged_array_ops from tensorflow.python.ops.ragged import ragged_gather_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -71,6 +72,7 @@ def concat(values, axis, name=None): @tf_export('ragged.stack') +@dispatch.add_dispatch_support def stack(values, axis=0, name=None): """Stacks a list of rank-`R` tensors into one rank-`(R+1)` `RaggedTensor`. diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py index aa148ae7fe8..3a6f6231149 100644 --- a/tensorflow/python/ops/ragged/ragged_factory_ops.py +++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_tensor_value +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -34,6 +35,7 @@ from tensorflow.python.util.tf_export import tf_export # Op to construct a constant RaggedTensor from a nested Python list. #=============================================================================== @tf_export("ragged.constant") +@dispatch.add_dispatch_support def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None, row_splits_dtype=dtypes.int64): """Constructs a constant RaggedTensor from a nested Python list. @@ -86,6 +88,7 @@ def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, @tf_export(v1=["ragged.constant_value"]) +@dispatch.add_dispatch_support def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None, row_splits_dtype="int64"): """Constructs a RaggedTensorValue from a nested Python list. @@ -311,6 +314,7 @@ def _default_inner_shape_for_pylist(pylist, ragged_rank): @tf_export(v1=["ragged.placeholder"]) +@dispatch.add_dispatch_support def placeholder(dtype, ragged_rank, value_shape=None, name=None): """Creates a placeholder for a `tf.RaggedTensor` that will always be fed. diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py index cc45f729e58..00b5ced6170 100644 --- a/tensorflow/python/ops/ragged/ragged_functional_ops.py +++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py @@ -24,10 +24,12 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_config from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import ragged_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("ragged.map_flat_values") +@dispatch.add_dispatch_support def map_flat_values(op, *args, **kwargs): """Applies `op` to the values of one or more RaggedTensors. 
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py index 5483cda571c..73a53583ada 100644 --- a/tensorflow/python/ops/ragged/ragged_math_ops.py +++ b/tensorflow/python/ops/ragged/ragged_math_ops.py @@ -30,6 +30,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_functional_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.ops.ragged import segment_id_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -38,6 +39,7 @@ from tensorflow.python.util.tf_export import tf_export #=============================================================================== # pylint: disable=redefined-builtin @tf_export('ragged.range') +@dispatch.add_dispatch_support def range(starts, limits=None, deltas=1, dtype=None, name=None, row_splits_dtype=dtypes.int64): """Returns a `RaggedTensor` containing the specified sequences of numbers. diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py index d5f21832044..0d9c4d506f3 100755 --- a/tensorflow/python/ops/ragged/ragged_string_ops.py +++ b/tensorflow/python/ops/ragged/ragged_string_ops.py @@ -29,10 +29,12 @@ from tensorflow.python.ops.ragged import ragged_math_ops from tensorflow.python.ops.ragged import ragged_tensor from tensorflow.python.util import compat as util_compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("strings.bytes_split") +@dispatch.add_dispatch_support def string_bytes_split(input, name=None): # pylint: disable=redefined-builtin """Split string elements of `input` into bytes. @@ -80,6 +82,7 @@ def string_bytes_split(input, name=None): # pylint: disable=redefined-builtin # pylint: disable=redefined-builtin @tf_export("strings.unicode_encode") +@dispatch.add_dispatch_support def unicode_encode(input, output_encoding, errors="replace", @@ -177,6 +180,7 @@ def unicode_encode(input, # pylint: disable=redefined-builtin @tf_export("strings.unicode_decode") +@dispatch.add_dispatch_support def unicode_decode(input, input_encoding, errors="replace", @@ -222,6 +226,7 @@ def unicode_decode(input, @tf_export("strings.unicode_decode_with_offsets") +@dispatch.add_dispatch_support def unicode_decode_with_offsets(input, input_encoding, errors="replace", @@ -283,6 +288,7 @@ def unicode_decode_with_offsets(input, @tf_export("strings.unicode_split") +@dispatch.add_dispatch_support def unicode_split(input, input_encoding, errors="replace", @@ -330,6 +336,7 @@ def unicode_split(input, @tf_export("strings.unicode_split_with_offsets") +@dispatch.add_dispatch_support def unicode_split_with_offsets(input, input_encoding, errors="replace", @@ -453,6 +460,7 @@ def _unicode_decode(input, input_encoding, errors, replacement_char, @tf_export("strings.split", v1=[]) +@dispatch.add_dispatch_support def string_split_v2(input, sep=None, maxsplit=-1, name=None): # pylint: disable=redefined-builtin """Split elements of `input` based on `sep` into a `RaggedTensor`. 
@@ -514,6 +522,7 @@ def string_split_v2(input, sep=None, maxsplit=-1, name=None): # pylint: disable @tf_export(v1=["string_split"]) +@dispatch.add_dispatch_support @deprecation.deprecated_args(None, "delimiter is deprecated, please use sep instead.", "delimiter") @@ -578,6 +587,7 @@ def string_split(source, sep=None, skip_empty=True, delimiter=None, # In TensorFlow 1.x, "tf.strings.split" uses the new signature (with maxsplit), # but we need to add the result_type argument. @tf_export(v1=["strings.split"]) +@dispatch.add_dispatch_support def strings_split_v1(input=None, sep=None, maxsplit=-1, # pylint: disable=redefined-builtin result_type="SparseTensor", source=None, name=None): """Split elements of `input` based on `sep`. @@ -651,6 +661,7 @@ def reduce_join(inputs, axis=None, keepdims=None, separator="", name=None): @tf_export("strings.ngrams") +@dispatch.add_dispatch_support def ngrams(data, ngram_width, separator=" ", diff --git a/tensorflow/python/ops/ragged/segment_id_ops.py b/tensorflow/python/ops/ragged/segment_id_ops.py index 5329860743e..0d4a58bfea4 100644 --- a/tensorflow/python/ops/ragged/segment_id_ops.py +++ b/tensorflow/python/ops/ragged/segment_id_ops.py @@ -25,12 +25,14 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.ragged import ragged_util +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # For background on "segments" and "segment ids", see: # https://www.tensorflow.org/api_docs/python/tf/math#Segmentation @tf_export("ragged.row_splits_to_segment_ids") +@dispatch.add_dispatch_support def row_splits_to_segment_ids(splits, name=None, out_type=None): """Generates the segmentation corresponding to a RaggedTensor `row_splits`. @@ -74,6 +76,7 @@ def row_splits_to_segment_ids(splits, name=None, out_type=None): # For background on "segments" and "segment ids", see: # https://www.tensorflow.org/api_docs/python/tf/math#Segmentation @tf_export("ragged.segment_ids_to_row_splits") +@dispatch.add_dispatch_support def segment_ids_to_row_splits(segment_ids, num_segments=None, out_type=None, name=None): """Generates the RaggedTensor `row_splits` corresponding to a segmentation. 
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index 83cb7fcc92a..1af91ed0dd3 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -36,10 +36,12 @@ from tensorflow.python.ops.gen_random_ops import * # pylint: enable=wildcard-import from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("random.normal", v1=["random.normal", "random_normal"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_normal") def random_normal(shape, mean=0.0, @@ -155,6 +157,7 @@ def parameterized_truncated_normal(shape, @tf_export("random.truncated_normal", v1=["random.truncated_normal", "truncated_normal"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("truncated_normal") def truncated_normal(shape, mean=0.0, @@ -202,6 +205,7 @@ ops.NotDifferentiable("TruncatedNormal") @tf_export("random.uniform", v1=["random.uniform", "random_uniform"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_uniform") def random_uniform(shape, minval=0, @@ -313,6 +317,7 @@ ops.NotDifferentiable("RandomUniform") @tf_export("random.shuffle", v1=["random.shuffle", "random_shuffle"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_shuffle") def random_shuffle(value, seed=None, name=None): """Randomly shuffles a tensor along its first dimension. @@ -345,6 +350,7 @@ def random_shuffle(value, seed=None, name=None): @tf_export("image.random_crop", v1=["image.random_crop", "random_crop"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_crop") def random_crop(value, size, seed=None, name=None): """Randomly crops a tensor to a given size. @@ -389,6 +395,7 @@ def random_crop(value, size, seed=None, name=None): @tf_export(v1=["random.multinomial", "multinomial"]) +@dispatch.add_dispatch_support @deprecation.deprecated( date=None, instructions="Use `tf.random.categorical` instead.") def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None): @@ -468,6 +475,7 @@ def _maybe_set_static_shape_helper(tensor, shape, postfix_tensor): @tf_export("random.gamma", v1=["random.gamma", "random_gamma"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_gamma") def random_gamma(shape, alpha, @@ -561,6 +569,7 @@ def random_gamma(shape, @tf_export(v1=["random.poisson", "random_poisson"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("random_poisson") def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None): """Draws `shape` samples from each of the given Poisson distribution(s). @@ -601,6 +610,7 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None): @tf_export("random.poisson", v1=[]) +@dispatch.add_dispatch_support def random_poisson_v2(shape, lam, dtype=dtypes.float32, seed=None, name=None): """Draws `shape` samples from each of the given Poisson distribution(s). 
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py index b87e5d65a37..6c11ebefb1c 100644 --- a/tensorflow/python/ops/rnn.py +++ b/tensorflow/python/ops/rnn.py @@ -32,6 +32,7 @@ from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.ops import tensor_array_ops from tensorflow.python.ops import variable_scope as vs from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import nest from tensorflow.python.util.tf_export import tf_export @@ -342,6 +343,7 @@ def _reverse_seq(input_seq, lengths): "keras.layers.RNN(cell))`, which is equivalent to " "this API") @tf_export(v1=["nn.bidirectional_dynamic_rnn"]) +@dispatch.add_dispatch_support def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, @@ -499,6 +501,7 @@ def bidirectional_dynamic_rnn(cell_fw, None, "Please use `keras.layers.RNN(cell)`, which is equivalent to this API") @tf_export(v1=["nn.dynamic_rnn"]) +@dispatch.add_dispatch_support def dynamic_rnn(cell, inputs, sequence_length=None, @@ -912,6 +915,7 @@ def _dynamic_rnn_loop(cell, @tf_export(v1=["nn.raw_rnn"]) +@dispatch.add_dispatch_support def raw_rnn(cell, loop_fn, parallel_iterations=None, @@ -1238,6 +1242,7 @@ def raw_rnn(cell, "Please use `keras.layers.RNN(cell, unroll=True)`, " "which is equivalent to this API") @tf_export(v1=["nn.static_rnn"]) +@dispatch.add_dispatch_support def static_rnn(cell, inputs, initial_state=None, @@ -1416,6 +1421,7 @@ def static_rnn(cell, "Please use `keras.layers.RNN(cell, stateful=True)`, " "which is equivalent to this API") @tf_export(v1=["nn.static_state_saving_rnn"]) +@dispatch.add_dispatch_support def static_state_saving_rnn(cell, inputs, state_saver, @@ -1510,6 +1516,7 @@ def static_state_saving_rnn(cell, "keras.layers.RNN(cell, unroll=True))`, which is " "equivalent to this API") @tf_export(v1=["nn.static_bidirectional_rnn"]) +@dispatch.add_dispatch_support def static_bidirectional_rnn(cell_fw, cell_bw, inputs, diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py index bee85dc4a5b..7ee5a16ca9a 100644 --- a/tensorflow/python/ops/script_ops.py +++ b/tensorflow/python/ops/script_ops.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import gen_script_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.util import compat from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util import lazy_loader from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect @@ -370,6 +371,7 @@ def _EagerPyFuncGrad(op, *dy): @tf_export("py_function") +@dispatch.add_dispatch_support def eager_py_func(func, inp, Tout, name=None): """Wraps a python function into a TensorFlow op that executes it eagerly. @@ -551,6 +553,7 @@ def py_func_common(func, inp, Tout, stateful=True, name=None): stateful argument making all functions stateful. """) @tf_export(v1=["py_func"]) +@dispatch.add_dispatch_support def py_func(func, inp, Tout, stateful=True, name=None): return py_func_common(func, inp, Tout, stateful, name=name) @@ -559,6 +562,7 @@ py_func.__doc__ = "%s" % py_func_common.__doc__ @tf_export("numpy_function") +@dispatch.add_dispatch_support def numpy_function(func, inp, Tout, name=None): """Wraps a python function and uses it as a TensorFlow op. 
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py index 988d437bae8..0b65033ce8c 100644 --- a/tensorflow/python/ops/sets_impl.py +++ b/tensorflow/python/ops/sets_impl.py @@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import gen_set_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -32,6 +33,7 @@ _VALID_DTYPES = set([ @tf_export("sets.size", v1=["sets.size", "sets.set_size"]) +@dispatch.add_dispatch_support def set_size(a, validate_indices=True): """Compute number of unique elements along last dimension of `a`. @@ -135,6 +137,7 @@ def _set_operation(a, b, set_operation, validate_indices=True): @tf_export( "sets.intersection", v1=["sets.intersection", "sets.set_intersection"]) +@dispatch.add_dispatch_support def set_intersection(a, b, validate_indices=True): """Compute set intersection of elements in last dimension of `a` and `b`. @@ -205,6 +208,7 @@ def set_intersection(a, b, validate_indices=True): @tf_export( "sets.difference", v1=["sets.difference", "sets.set_difference"]) +@dispatch.add_dispatch_support def set_difference(a, b, aminusb=True, validate_indices=True): """Compute set difference of elements in last dimension of `a` and `b`. @@ -286,6 +290,7 @@ def set_difference(a, b, aminusb=True, validate_indices=True): @tf_export( "sets.union", v1=["sets.union", "sets.set_union"]) +@dispatch.add_dispatch_support def set_union(a, b, validate_indices=True): """Compute set union of elements in last dimension of `a` and `b`. diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py index d628e54cdf9..18730743941 100644 --- a/tensorflow/python/ops/signal/dct_ops.py +++ b/tensorflow/python/ops/signal/dct_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops as _array_ops from tensorflow.python.ops import math_ops as _math_ops from tensorflow.python.ops.signal import fft_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -50,6 +51,7 @@ def _validate_dct_arguments(input_tensor, dct_type, n, axis, norm): # TODO(rjryan): Implement `axis` parameter. @tf_export("signal.dct", v1=["signal.dct", "spectral.dct"]) +@dispatch.add_dispatch_support def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`. @@ -181,6 +183,7 @@ def dct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disabl # TODO(rjryan): Implement `n` and `axis` parameters. @tf_export("signal.idct", v1=["signal.idct", "spectral.idct"]) +@dispatch.add_dispatch_support def idct(input, type=2, n=None, axis=-1, norm=None, name=None): # pylint: disable=redefined-builtin """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`. 
diff --git a/tensorflow/python/ops/signal/fft_ops.py b/tensorflow/python/ops/signal/fft_ops.py index 6e9e8ef80e4..86a94cf5de7 100644 --- a/tensorflow/python/ops/signal/fft_ops.py +++ b/tensorflow/python/ops/signal/fft_ops.py @@ -26,6 +26,7 @@ from tensorflow.python.ops import array_ops as _array_ops from tensorflow.python.ops import gen_spectral_ops from tensorflow.python.ops import manip_ops from tensorflow.python.ops import math_ops as _math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -181,17 +182,23 @@ ifft2d = gen_spectral_ops.ifft2d fft3d = gen_spectral_ops.fft3d ifft3d = gen_spectral_ops.ifft3d rfft = _rfft_wrapper(gen_spectral_ops.rfft, 1, "rfft") -tf_export("signal.rfft", v1=["signal.rfft", "spectral.rfft"])(rfft) +tf_export("signal.rfft", v1=["signal.rfft", "spectral.rfft"])( + dispatch.add_dispatch_support(rfft)) irfft = _irfft_wrapper(gen_spectral_ops.irfft, 1, "irfft") -tf_export("signal.irfft", v1=["signal.irfft", "spectral.irfft"])(irfft) +tf_export("signal.irfft", v1=["signal.irfft", "spectral.irfft"])( + dispatch.add_dispatch_support(irfft)) rfft2d = _rfft_wrapper(gen_spectral_ops.rfft2d, 2, "rfft2d") -tf_export("signal.rfft2d", v1=["signal.rfft2d", "spectral.rfft2d"])(rfft2d) +tf_export("signal.rfft2d", v1=["signal.rfft2d", "spectral.rfft2d"])( + dispatch.add_dispatch_support(rfft2d)) irfft2d = _irfft_wrapper(gen_spectral_ops.irfft2d, 2, "irfft2d") -tf_export("signal.irfft2d", v1=["signal.irfft2d", "spectral.irfft2d"])(irfft2d) +tf_export("signal.irfft2d", v1=["signal.irfft2d", "spectral.irfft2d"])( + dispatch.add_dispatch_support(irfft2d)) rfft3d = _rfft_wrapper(gen_spectral_ops.rfft3d, 3, "rfft3d") -tf_export("signal.rfft3d", v1=["signal.rfft3d", "spectral.rfft3d"])(rfft3d) +tf_export("signal.rfft3d", v1=["signal.rfft3d", "spectral.rfft3d"])( + dispatch.add_dispatch_support(rfft3d)) irfft3d = _irfft_wrapper(gen_spectral_ops.irfft3d, 3, "irfft3d") -tf_export("signal.irfft3d", v1=["signal.irfft3d", "spectral.irfft3d"])(irfft3d) +tf_export("signal.irfft3d", v1=["signal.irfft3d", "spectral.irfft3d"])( + dispatch.add_dispatch_support(irfft3d)) def _fft_size_for_grad(grad, rank): @@ -363,6 +370,7 @@ def _irfft_grad_helper(rank, rfft_fn): @tf_export("signal.fftshift") +@dispatch.add_dispatch_support def fftshift(x, axes=None, name=None): """Shift the zero-frequency component to the center of the spectrum. @@ -407,6 +415,7 @@ def fftshift(x, axes=None, name=None): @tf_export("signal.ifftshift") +@dispatch.add_dispatch_support def ifftshift(x, axes=None, name=None): """The inverse of fftshift. 
diff --git a/tensorflow/python/ops/signal/mel_ops.py b/tensorflow/python/ops/signal/mel_ops.py index b95876bc977..cf0bed9ef1b 100644 --- a/tensorflow/python/ops/signal/mel_ops.py +++ b/tensorflow/python/ops/signal/mel_ops.py @@ -24,6 +24,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.signal import shape_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -90,6 +91,7 @@ def _validate_arguments(num_mel_bins, sample_rate, @tf_export('signal.linear_to_mel_weight_matrix') +@dispatch.add_dispatch_support def linear_to_mel_weight_matrix(num_mel_bins=20, num_spectrogram_bins=129, sample_rate=8000, diff --git a/tensorflow/python/ops/signal/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py index 56cbff40bca..948b78a858e 100644 --- a/tensorflow/python/ops/signal/mfcc_ops.py +++ b/tensorflow/python/ops/signal/mfcc_ops.py @@ -22,10 +22,12 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.signal import dct_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('signal.mfccs_from_log_mel_spectrograms') +@dispatch.add_dispatch_support def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None): """Computes [MFCCs][mfcc] of `log_mel_spectrograms`. diff --git a/tensorflow/python/ops/signal/reconstruction_ops.py b/tensorflow/python/ops/signal/reconstruction_ops.py index fcdcf592f14..e340e97b3e5 100644 --- a/tensorflow/python/ops/signal/reconstruction_ops.py +++ b/tensorflow/python/ops/signal/reconstruction_ops.py @@ -23,10 +23,12 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export("signal.overlap_and_add") +@dispatch.add_dispatch_support def overlap_and_add(signal, frame_step, name=None): """Reconstructs a signal from a framed representation. diff --git a/tensorflow/python/ops/signal/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py index 1c95873fc3d..7a3acce3475 100644 --- a/tensorflow/python/ops/signal/shape_ops.py +++ b/tensorflow/python/ops/signal/shape_ops.py @@ -25,6 +25,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.signal import util_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -55,6 +56,7 @@ def _infer_frame_shape(signal, frame_length, frame_step, pad_end, axis): @tf_export("signal.frame") +@dispatch.add_dispatch_support def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1, name=None): """Expands `signal`'s `axis` dimension into frames of `frame_length`. 
diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py index d096e53e8f8..7c4c5542b84 100644 --- a/tensorflow/python/ops/signal/spectral_ops.py +++ b/tensorflow/python/ops/signal/spectral_ops.py @@ -31,10 +31,12 @@ from tensorflow.python.ops.signal import fft_ops from tensorflow.python.ops.signal import reconstruction_ops from tensorflow.python.ops.signal import shape_ops from tensorflow.python.ops.signal import window_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('signal.stft') +@dispatch.add_dispatch_support def stft(signals, frame_length, frame_step, fft_length=None, window_fn=window_ops.hann_window, pad_end=False, name=None): @@ -95,6 +97,7 @@ def stft(signals, frame_length, frame_step, fft_length=None, @tf_export('signal.inverse_stft_window_fn') +@dispatch.add_dispatch_support def inverse_stft_window_fn(frame_step, forward_window_fn=window_ops.hann_window, name=None): @@ -156,6 +159,7 @@ def inverse_stft_window_fn(frame_step, @tf_export('signal.inverse_stft') +@dispatch.add_dispatch_support def inverse_stft(stfts, frame_length, frame_step, @@ -291,6 +295,7 @@ def _enclosing_power_of_two(value): @tf_export('signal.mdct') +@dispatch.add_dispatch_support def mdct(signals, frame_length, window_fn=window_ops.vorbis_window, pad_end=False, norm=None, name=None): """Computes the [Modified Discrete Cosine Transform][mdct] of `signals`. @@ -366,6 +371,7 @@ def mdct(signals, frame_length, window_fn=window_ops.vorbis_window, @tf_export('signal.inverse_mdct') +@dispatch.add_dispatch_support def inverse_mdct(mdcts, window_fn=window_ops.vorbis_window, norm=None, diff --git a/tensorflow/python/ops/signal/window_ops.py b/tensorflow/python/ops/signal/window_ops.py index bb10bdf4be5..eb33c3f3b58 100644 --- a/tensorflow/python/ops/signal/window_ops.py +++ b/tensorflow/python/ops/signal/window_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @@ -52,6 +53,7 @@ def _check_params(window_length, dtype): @tf_export('signal.kaiser_window') +@dispatch.add_dispatch_support def kaiser_window(window_length, beta=12., dtype=dtypes.float32, name=None): """Generate a [Kaiser window][kaiser]. @@ -91,6 +93,7 @@ def kaiser_window(window_length, beta=12., dtype=dtypes.float32, name=None): @tf_export('signal.kaiser_bessel_derived_window') +@dispatch.add_dispatch_support def kaiser_bessel_derived_window(window_length, beta=12., dtype=dtypes.float32, name=None): """Generate a [Kaiser Bessel derived window][kbd]. @@ -118,6 +121,7 @@ def kaiser_bessel_derived_window(window_length, beta=12., @tf_export('signal.vorbis_window') +@dispatch.add_dispatch_support def vorbis_window(window_length, dtype=dtypes.float32, name=None): """Generate a [Vorbis power complementary window][vorbis]. @@ -142,6 +146,7 @@ def vorbis_window(window_length, dtype=dtypes.float32, name=None): @tf_export('signal.hann_window') +@dispatch.add_dispatch_support def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): """Generate a [Hann window][hann]. 
@@ -167,6 +172,7 @@ def hann_window(window_length, periodic=True, dtype=dtypes.float32, name=None): @tf_export('signal.hamming_window') +@dispatch.add_dispatch_support def hamming_window(window_length, periodic=True, dtype=dtypes.float32, name=None): """Generate a [Hamming][hamming] window. diff --git a/tensorflow/python/ops/sort_ops.py b/tensorflow/python/ops/sort_ops.py index 92435e6bdef..4e66a80bc01 100644 --- a/tensorflow/python/ops/sort_ops.py +++ b/tensorflow/python/ops/sort_ops.py @@ -30,10 +30,12 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export @tf_export('sort') +@dispatch.add_dispatch_support def sort(values, axis=-1, direction='ASCENDING', name=None): """Sorts a tensor. @@ -67,6 +69,7 @@ def sort(values, axis=-1, direction='ASCENDING', name=None): @tf_export('argsort') +@dispatch.add_dispatch_support def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None): """Returns the indices of a tensor that give its sorted order along an axis. diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py index 844aa3c744c..c4c88ab86ef 100644 --- a/tensorflow/python/ops/sparse_ops.py +++ b/tensorflow/python/ops/sparse_ops.py @@ -1065,6 +1065,7 @@ def sparse_slice(sp_input, start, size, name=None): @tf_export(v1=["sparse_to_dense"]) +@dispatch.add_dispatch_support @deprecation.deprecated( None, "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.") @@ -1994,6 +1995,7 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None): @tf_export(v1=["io.serialize_sparse", "serialize_sparse"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("serialize_sparse") def serialize_sparse(sp_input, name=None, out_type=dtypes.string): """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object. @@ -2014,6 +2016,7 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string): @tf_export("io.serialize_sparse", v1=[]) +@dispatch.add_dispatch_support def serialize_sparse_v2(sp_input, out_type=dtypes.string, name=None): """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object. @@ -2040,6 +2043,7 @@ def serialize_sparse_v2(sp_input, out_type=dtypes.string, name=None): @tf_export(v1=["io.serialize_many_sparse", "serialize_many_sparse"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("serialize_many_sparse") def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string): """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`. @@ -2069,6 +2073,7 @@ def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string): @tf_export("io.serialize_many_sparse", v1=[]) +@dispatch.add_dispatch_support def serialize_many_sparse_v2(sp_input, out_type=dtypes.string, name=None): """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`. @@ -2172,6 +2177,7 @@ def deserialize_sparse(serialized_sparse, dtype, rank=None, name=None): @tf_export( "io.deserialize_many_sparse", v1=["io.deserialize_many_sparse", "deserialize_many_sparse"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("deserialize_many_sparse") def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None): """Deserialize and concatenate `SparseTensors` from a serialized minibatch. 
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py index a05a488408d..036346cdecd 100644 --- a/tensorflow/python/ops/special_math_ops.py +++ b/tensorflow/python/ops/special_math_ops.py @@ -42,11 +42,13 @@ from tensorflow.python.ops import gen_special_math_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export # TODO(b/27419586) Change docstring for required dtype of x once int allowed @tf_export('math.lbeta', v1=['math.lbeta', 'lbeta']) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints('lbeta') def lbeta(x, name=None): r"""Computes \\(ln(|Beta(x)|)\\), reducing along the last dimension. @@ -102,6 +104,7 @@ def lbeta(x, name=None): @tf_export('math.special.dawsn') +@dispatch.add_dispatch_support def dawsn(x, name=None): """Computes Dawson's integral of `x` element-wise. @@ -131,6 +134,7 @@ def dawsn(x, name=None): @tf_export('math.special.expint') +@dispatch.add_dispatch_support def expint(x, name=None): """Computes the Exponential integral of `x` element-wise. @@ -159,6 +163,7 @@ def expint(x, name=None): @tf_export('math.special.fresnel_cos') +@dispatch.add_dispatch_support def fresnel_cos(x, name=None): """Computes Fresnel's cosine integral of `x` element-wise. @@ -188,6 +193,7 @@ def fresnel_cos(x, name=None): @tf_export('math.special.fresnel_sin') +@dispatch.add_dispatch_support def fresnel_sin(x, name=None): """Computes Fresnel's sine integral of `x` element-wise. @@ -216,6 +222,7 @@ def fresnel_sin(x, name=None): @tf_export('math.special.spence') +@dispatch.add_dispatch_support def spence(x, name=None): """Computes Spence's integral of `x` element-wise. @@ -244,6 +251,7 @@ def spence(x, name=None): @tf_export('math.bessel_i0') +@dispatch.add_dispatch_support def bessel_i0(x, name=None): """Computes the Bessel i0 function of `x` element-wise. @@ -268,6 +276,7 @@ def bessel_i0(x, name=None): @tf_export('math.bessel_i1') +@dispatch.add_dispatch_support def bessel_i1(x, name=None): """Computes the Bessel i1 function of `x` element-wise. @@ -325,6 +334,7 @@ def _enclosing_tpu_context(): @tf_export('einsum', 'linalg.einsum') +@dispatch.add_dispatch_support def einsum(equation, *inputs, **kwargs): """Tensor contraction over specified indices and outer product. diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py index 2bf53d3a0f7..0ae29ba0219 100644 --- a/tensorflow/python/ops/stateless_random_ops.py +++ b/tensorflow/python/ops/stateless_random_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_stateless_random_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import deprecation +from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import tf_export ops.NotDifferentiable("StatelessMultinomial") @@ -40,6 +41,7 @@ ops.NotDifferentiable("StatelessTruncatedNormal") @tf_export("random.experimental.stateless_split") +@dispatch.add_dispatch_support def split(seed, num=2): """Splits an RNG seed into `num` new seeds by adding a leading axis. @@ -73,6 +75,7 @@ def split(seed, num=2): @tf_export("random.experimental.stateless_fold_in") +@dispatch.add_dispatch_support def fold_in(seed, data): """Folds in data to an RNG seed to form a new RNG seed. 
@@ -111,6 +114,7 @@ def fold_in(seed, data): @tf_export("random.stateless_uniform") +@dispatch.add_dispatch_support def stateless_random_uniform(shape, seed, minval=0, @@ -205,6 +209,7 @@ def stateless_random_uniform(shape, @tf_export("random.stateless_binomial") +@dispatch.add_dispatch_support def stateless_random_binomial(shape, seed, counts, @@ -274,6 +279,7 @@ def stateless_random_binomial(shape, @tf_export("random.stateless_gamma") +@dispatch.add_dispatch_support def stateless_random_gamma(shape, seed, alpha, @@ -372,6 +378,7 @@ def stateless_random_gamma(shape, @tf_export("random.stateless_poisson") +@dispatch.add_dispatch_support def stateless_random_poisson(shape, seed, lam, @@ -434,6 +441,7 @@ def stateless_random_poisson(shape, @tf_export("random.stateless_normal") +@dispatch.add_dispatch_support def stateless_random_normal(shape, seed, mean=0.0, @@ -474,6 +482,7 @@ def stateless_random_normal(shape, @tf_export("random.stateless_truncated_normal") +@dispatch.add_dispatch_support def stateless_truncated_normal(shape, seed, mean=0.0, @@ -520,6 +529,7 @@ def stateless_truncated_normal(shape, @tf_export(v1=["random.stateless_multinomial"]) +@dispatch.add_dispatch_support @deprecation.deprecated( date=None, instructions="Use `tf.random.stateless_categorical` instead.") def stateless_multinomial(logits, @@ -562,6 +572,7 @@ def stateless_multinomial(logits, @tf_export("random.stateless_categorical") +@dispatch.add_dispatch_support def stateless_categorical(logits, num_samples, seed, diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py index 09ba078383a..dd0ae223d9d 100644 --- a/tensorflow/python/ops/string_ops.py +++ b/tensorflow/python/ops/string_ops.py @@ -73,6 +73,7 @@ regex_full_match.__doc__ = gen_string_ops.regex_full_match.__doc__ @tf_export( "strings.regex_replace", v1=["strings.regex_replace", "regex_replace"]) +@dispatch.add_dispatch_support @deprecation.deprecated_endpoints("regex_replace") @dispatch.add_dispatch_support def regex_replace(input, pattern, rewrite, replace_global=True, name=None): @@ -112,6 +113,7 @@ def regex_replace(input, pattern, rewrite, replace_global=True, name=None): @tf_export("strings.format") +@dispatch.add_dispatch_support def string_format(template, inputs, placeholder="{}", summarize=3, name=None): r"""Formats a string template using a list of tensors. 
@@ -300,6 +302,7 @@ def _reduce_join_reduction_dims(x, axis):
 
 
 @tf_export(v1=["strings.reduce_join", "reduce_join"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_args(None,
                              "keep_dims is deprecated, use keepdims instead",
                              "keep_dims")
@@ -412,6 +415,7 @@ string_length_v2.__doc__ = gen_string_ops.string_length.__doc__
 
 
 @tf_export(v1=["substr"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated(None, "Use `tf.strings.substr` instead of `tf.substr`.")
 def substr_deprecated(input, pos, len, name=None, unit="BYTE"):
   return substr(input, pos, len, name=name, unit=unit)
@@ -476,6 +480,7 @@ def string_to_number(input, out_type=dtypes.float32, name=None):
 
 
 @tf_export(v1=["strings.to_number", "string_to_number"])
+@dispatch.add_dispatch_support
 def string_to_number_v1(
     string_tensor=None,
     out_type=dtypes.float32,
@@ -519,6 +524,7 @@ def string_to_hash_bucket(input, num_buckets, name=None):
 
 
 @tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"])
+@dispatch.add_dispatch_support
 def string_to_hash_bucket_v1(
     string_tensor=None,
     num_buckets=None,
@@ -532,6 +538,7 @@ string_to_hash_bucket_v1.__doc__ = gen_string_ops.string_to_hash_bucket.__doc__
 
 
 @tf_export("strings.join", v1=["strings.join", "string_join"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("string_join")
 @dispatch.add_dispatch_support
 def string_join(inputs, separator="", name=None):

From fdcdac12a72ead0128463dc029af58e896897cc9 Mon Sep 17 00:00:00 2001
From: Rajeshwar Reddy T <43972606+rthadur@users.noreply.github.com>
Date: Fri, 15 May 2020 11:22:06 -0700
Subject: [PATCH 274/412] Update bot_config.yml

---
 .github/bot_config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/bot_config.yml b/.github/bot_config.yml
index d63bd2ce844..fdb19d453c2 100644
--- a/.github/bot_config.yml
+++ b/.github/bot_config.yml
@@ -1,4 +1,4 @@
- # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");
  # you may not use this file except in compliance with the License.

From ff17316b19d5958605cebf941e4302d60f405784 Mon Sep 17 00:00:00 2001
From: Pavithra Vijay
Date: Fri, 15 May 2020 11:20:09 -0700
Subject: [PATCH 275/412] Check for `_metrics` in case sublayer is resetting `_metrics` property.

PiperOrigin-RevId: 311767501
Change-Id: I1f97904314a0f1912c918b89f461edd1183f4604
---
 tensorflow/python/keras/engine/base_layer.py    | 2 +-
 tensorflow/python/keras/engine/base_layer_v1.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 94b696d842b..0f4bec92e39 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -2588,7 +2588,7 @@ class Layer(module.Module, version_utils.LayerVersionSelector):
     # Keep track of metric instance created in subclassed layer.
     from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
     for val in nest.flatten(value):
-      if isinstance(val, metrics_module.Metric):
+      if isinstance(val, metrics_module.Metric) and hasattr(self, '_metrics'):
         self._metrics.append(val)
 
     # TODO(scottzhu): Need to track Module object as well for weight tracking.
diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py
index 4a277ec3a3e..80e0b4be2f1 100644
--- a/tensorflow/python/keras/engine/base_layer_v1.py
+++ b/tensorflow/python/keras/engine/base_layer_v1.py
@@ -2226,7 +2226,7 @@ class Layer(base_layer.Layer):
     # Keep track of metric instance created in subclassed layer.
     from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
     for val in nest.flatten(value):
-      if isinstance(val, metrics_module.Metric):
+      if isinstance(val, metrics_module.Metric) and hasattr(self, '_metrics'):
         self._metrics.append(val)
 
     # TODO(scottzhu): Need to track Module object as well for weight tracking.

From 6e2219518cf6351bd7067b98e75e2862b5c5b88a Mon Sep 17 00:00:00 2001
From: Rajeshwar Reddy T <43972606+rthadur@users.noreply.github.com>
Date: Fri, 15 May 2020 11:23:59 -0700
Subject: [PATCH 276/412] Update bot_config.yml

---
 .github/bot_config.yml | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/.github/bot_config.yml b/.github/bot_config.yml
index fdb19d453c2..ee6037f4b94 100644
--- a/.github/bot_config.yml
+++ b/.github/bot_config.yml
@@ -1,23 +1,23 @@
- # Copyright 2019 The TensorFlow Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- #
- # THIS IS A GENERATED DOCKERFILE.
- #
- # This file was assembled from multiple pieces, whose use is documented
- # throughout. Please refer to the TensorFlow dockerfiles documentation
- # for more information.
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+#
+# THIS IS A GENERATED DOCKERFILE.
+#
+# This file was assembled from multiple pieces, whose use is documented
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 # A list of assignees
 assignees:

From 1f530076d15fe482234b83273e4432b14c353853 Mon Sep 17 00:00:00 2001
From: Jared Duke
Date: Fri, 15 May 2020 11:21:31 -0700
Subject: [PATCH 277/412] Fix TFLite builds on Windows/MacOS

Avoid using `--enable_platform_specific_config` when cross-compiling for
iOS/Android, as this pulls in host build flags, which may not be appropriate
(e.g., when cross-compiling for Android on a Windows host).

Also fix an issue when building tensorflowlite_c for iOS.

Fixes #38525.

PiperOrigin-RevId: 311767770
Change-Id: I80b817fd89a6889dc78be50f1def8b899f091cb6
---
 .bazelrc                  | 11 ++++++++++-
 configure.py              |  1 -
 tensorflow/lite/c/BUILD   |  3 +++
 tensorflow/tensorflow.bzl |  3 +++
 4 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 224238d7c0b..7e0f820b4c2 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -235,10 +235,15 @@ build:c++17 --cxxopt=-std=c++1z
 build:c++17 --cxxopt=-stdlib=libc++
 build:c++1z --config=c++17
 
-# Enable using platform specific build settings
+# Enable using platform specific build settings, except when cross-compiling for
+# mobile platforms.
 build --enable_platform_specific_config
+build:android --noenable_platform_specific_config
+build:ios --noenable_platform_specific_config
 
 # Suppress C++ compiler warnings, otherwise build logs become 10s of MBs.
+build:android --copt=-w
+build:ios --copt=-w
 build:linux --copt=-w
 build:macos --copt=-w
 build:windows --copt=/w
@@ -258,6 +263,10 @@ build:macos --define=INCLUDEDIR=$(PREFIX)/include
 # TF_SYSTEM_LIBS do not work on windows.
 
 # By default, build TF in C++ 14 mode.
+build:android --cxxopt=-std=c++14
+build:android --host_cxxopt=-std=c++14
+build:ios --cxxopt=-std=c++14
+build:ios --host_cxxopt=-std=c++14
 build:linux --cxxopt=-std=c++14
 build:linux --host_cxxopt=-std=c++14
 build:macos --cxxopt=-std=c++14
diff --git a/configure.py b/configure.py
index 945c3036a8d..9154000d944 100644
--- a/configure.py
+++ b/configure.py
@@ -1387,7 +1387,6 @@ def main():
     # Windows.
     environ_cp['TF_DOWNLOAD_CLANG'] = '0'
     environ_cp['TF_NEED_MPI'] = '0'
-    environ_cp['TF_SET_ANDROID_WORKSPACE'] = '0'
 
   if is_macos():
     environ_cp['TF_NEED_TENSORRT'] = '0'
diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD
index e1702d40d5a..1aa043b7c0c 100644
--- a/tensorflow/lite/c/BUILD
+++ b/tensorflow/lite/c/BUILD
@@ -22,6 +22,9 @@ package(
 tflite_cc_shared_object(
     name = "tensorflowlite_c",
     linkopts = select({
+        "//tensorflow:ios": [
+            "-Wl,-exported_symbols_list,$(location //tensorflow/lite/c:exported_symbols.lds)",
+        ],
         "//tensorflow:macos": [
             "-Wl,-exported_symbols_list,$(location //tensorflow/lite/c:exported_symbols.lds)",
         ],
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index f56330b428a..c029de9a4e8 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -615,6 +615,9 @@ def tf_cc_shared_object(
         linkshared = 1,
         data = data + data_extra,
         linkopts = linkopts + _rpath_linkopts(name_os_full) + select({
+            clean_dep("//tensorflow:ios"): [
+                "-Wl,-install_name,@rpath/" + soname,
+            ],
            clean_dep("//tensorflow:macos"): [
                "-Wl,-install_name,@rpath/" + soname,
            ],

From 9cb8d45b72233c19125c4ca8890fae5611110ec9 Mon Sep 17 00:00:00 2001
From: Lev Proleev
Date: Fri, 15 May 2020 11:44:26 -0700
Subject: [PATCH 278/412] Add NNAPI delegate support for Elu

PiperOrigin-RevId: 311772163
Change-Id: I94393872c9afa25aafc2fc55f688d47caa57ed14
---
 .../lite/delegates/nnapi/acceleration_test_list.cc |  7 ++++---
 tensorflow/lite/delegates/nnapi/nnapi_delegate.cc  | 14 ++++++++++++--
 .../lite/delegates/nnapi/nnapi_delegate_kernel.h   |  1 +
 tensorflow/lite/kernels/lstm_test.cc               |  2 +-
 tensorflow/lite/nnapi/NeuralNetworksTypes.h        |  7 +++++++
 tensorflow/lite/nnapi/nnapi_implementation.cc      | 13 -------------
 6 files changed, 25 insertions(+), 19 deletions(-)

diff --git a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc
index cc9e049123e..46a6a720d1e 100644
---
a/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc +++ b/tensorflow/lite/delegates/nnapi/acceleration_test_list.cc @@ -56,6 +56,7 @@ FloatActivationsOpTest/PRelu,29 LogisticOpTest/LogisticOpTest/Sigmoid(.+nt8)?/\d+ LogisticOpTest/LogisticOpTest/Sigmoid/\d+ TanhOpTest/TanhOpTest/Tanh(.+nt8)?/\d+,29 +FloatActivationsOpTest/Elu,30 FloatActivationsOpTest/HardSwish QuantizedActivationsOpTest/HardSwish QuantizedActivationsOpTest/HardSwishBias @@ -301,14 +302,14 @@ VariedShapeSpec/ReshapeOpTest/WithStretchDimension/1 # resize_bilinear_test // align_corners & half_pixel_centers are not implemented in NNAPI before API 30 -ResizeBilinearOpTest/ResizeBilinearOpTest.+HalfPixelCenters.*,30 +ResizeBilinearOpTest/ResizeBilinearOpTest.+HalfPixelCenters.*/0,30 // Only models with constant size tensor are accelerated ResizeBilinearOpTest/ResizeBilinearOpTest/.+/0,29 # resize_nearest_neighbor_test // align_corners & half_pixel_centers are not implemented in NNAPI before API 30 -ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+AlignCorners.*,30 -ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+HalfPixelCenters.*,30 +ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+AlignCorners.*/0,30 +ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest.+HalfPixelCenters.*/0,30 // Only models with constant size tensor are accelerated ResizeNearestNeighborOpTest/ResizeNearestNeighborOpTest/.+/0,29 diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index ff6ad0dc0d9..e6faea62bf6 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -1623,7 +1624,7 @@ bool NNAPIDelegateKernel::Validate( } } break; case kTfLiteBuiltinResizeBilinear: { - ExpectMaxOpVersion(version, 2, &val_ctx); + ExpectMaxOpVersion(version, 3, &val_ctx); const auto& input = context->tensors[node->inputs->data[0]]; const auto output_dims = context->tensors[node->outputs->data[0]].dims; Expect(input.dims->size == 4, @@ -1663,7 +1664,7 @@ bool NNAPIDelegateKernel::Validate( } } break; case kTfLiteBuiltinResizeNearestNeighbor: { - ExpectMaxOpVersion(version, 2, &val_ctx); + ExpectMaxOpVersion(version, 3, &val_ctx); ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI12, &val_ctx); ExpectIsFloatOrQuant8Operator(context, node, &val_ctx); @@ -2334,6 +2335,11 @@ bool NNAPIDelegateKernel::Validate( NNAPIValidationFailureType::kUnsupportedInputType, "NNAPI only supports floating point input.", &val_ctx); } break; + case kTfLiteBuiltinElu: { + ExpectOpVersion(version, 1, &val_ctx); + ExpectMinAndroidSdkVersion(android_sdk_version, kMinSdkVersionForNNAPI13, + &val_ctx); + } break; default: // All other operators are not mapped. AddValidationFailure(NNAPIValidationFailureType::kUnsupportedOperator, @@ -3111,6 +3117,10 @@ TfLiteStatus NNAPIDelegateKernel::Map( mapping_args.builder->AddScalarBoolOperand(builtin->keep_dims); *nn_op_type = ANEURALNETWORKS_REDUCE_SUM; } break; + case kTfLiteBuiltinElu: { + mapping_args.builder->AddScalarFloat32Operand(1.0); + *nn_op_type = ANEURALNETWORKS_ELU; + } break; default: // All other operators are not mapped. 
return kTfLiteError; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 668fdf5b5f6..af93d9650c9 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -31,6 +31,7 @@ namespace nnapi { constexpr int32_t kMinSdkVersionForNNAPI = 27; constexpr int32_t kMinSdkVersionForNNAPI11 = 28; constexpr int32_t kMinSdkVersionForNNAPI12 = 29; +constexpr int32_t kMinSdkVersionForNNAPI13 = 30; // Track tensor indices to NN API tensor indices mapping. class OperandMapping { diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc index 2bd31eae8db..62634e6bfbd 100644 --- a/tensorflow/lite/kernels/lstm_test.cc +++ b/tensorflow/lite/kernels/lstm_test.cc @@ -2050,7 +2050,7 @@ TEST_P(CifgPeepholeProjectionNoClippingLayerNormLstmTest, }}; VerifyGoldens(lstm_input_, lstm_golden_output_, &layer_norm_lstm, - /*tolerance=*/0.000902065); + /*tolerance=*/0.0009021); } class CifgPeepholeProjectionNoClippingLayerNormLstmInt8Test diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index 851c1718e0a..a3dfd373405 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -136,6 +136,13 @@ enum { ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_LSTM = 92, ANEURALNETWORKS_UNIDIRECTIONAL_SEQUENCE_RNN = 93, ANEURALNETWORKS_RESIZE_NEAREST_NEIGHBOR = 94, + ANEURALNETWORKS_QUANTIZED_LSTM = 95, + ANEURALNETWORKS_IF = 96, + ANEURALNETWORKS_WHILE = 97, + ANEURALNETWORKS_ELU = 98, + ANEURALNETWORKS_HARD_SWISH = 99, + ANEURALNETWORKS_FILL = 100, + ANEURALNETWORKS_RANK = 101, }; /** diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc index 71a4de53e9a..accdfb6c7da 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.cc +++ b/tensorflow/lite/nnapi/nnapi_implementation.cc @@ -45,19 +45,6 @@ int32_t GetAndroidSdkVersion() { } result = result * 10 + digit; } - // TODO(levp): remove once SDK gets updated to 29th level - // Upgrade SDK version for pre-release Q to be able to test functionality - // available from SDK level 29. - if (result == 28) { - char versionCodename[PROP_VALUE_MAX]; - const char* versionCodenameProp = "ro.build.version.codename"; - length = __system_property_get(versionCodenameProp, versionCodename); - if (length != 0) { - if (versionCodename[0] == 'Q') { - return 29; - } - } - } return result; } return 0; From 362818b71e540df53f909f09f55f5c31234c29ca Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Fri, 15 May 2020 11:52:10 -0700 Subject: [PATCH 279/412] Attempt to build libtensorflow GPU with the manylinux2010 toolchain. 
PiperOrigin-RevId: 311773671 Change-Id: I6a0a34852786fb2187ea7ad131a6e4878c84e089 --- tensorflow/tools/ci_build/builds/libtensorflow.sh | 2 +- tensorflow/tools/ci_build/linux/libtensorflow_docker.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh index 44180b8bf84..a281afe7442 100755 --- a/tensorflow/tools/ci_build/builds/libtensorflow.sh +++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh @@ -54,7 +54,7 @@ function build_libtensorflow_tarball() { BAZEL_OPTS="--config=opt --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0" export CC_OPT_FLAGS="-mavx -msse4.2" if [ "${TF_NEED_CUDA}" == "1" ]; then - BAZEL_OPTS="${BAZEL_OPTS} --config=cuda" + BAZEL_OPTS="${BAZEL_OPTS} --config=cuda --crosstool_top=//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain" export TF_NEED_ROCM=0 fi bazel clean --expunge diff --git a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh index 467b8dc8083..1b255682671 100755 --- a/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh +++ b/tensorflow/tools/ci_build/linux/libtensorflow_docker.sh @@ -36,7 +36,7 @@ DOCKER_BINARY="docker" if [ "${TF_NEED_CUDA}" == "1" ]; then DOCKER_IMAGE="tf-tensorflow-gpu" DOCKER_BINARY="nvidia-docker" - DOCKER_FILE="Dockerfile.gpu" + DOCKER_FILE="Dockerfile.rbe.cuda10.1-cudnn7-ubuntu16.04-manylinux2010" fi if [ "${TF_NEED_ROCM}" == "1" ]; then DOCKER_IMAGE="tf-tensorflow-rocm" From 75132b735b4f0f0ec7f86f5d3db9b8e05209ab63 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 11:54:21 -0700 Subject: [PATCH 280/412] Minor cleanup for strings in xplane_to_profile_response. PiperOrigin-RevId: 311774079 Change-Id: I445cd1121c548dd2beb133057eeab4f434939df9 --- .../core/profiler/convert/xplane_to_profile_response.cc | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc index e6fe74942fc..70a07171310 100644 --- a/tensorflow/core/profiler/convert/xplane_to_profile_response.cc +++ b/tensorflow/core/profiler/convert/xplane_to_profile_response.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/core/profiler/convert/xplane_to_profile_response.h" +#include + #include "absl/container/flat_hash_set.h" #include "absl/strings/str_cat.h" #include "absl/strings/string_view.h" @@ -78,14 +80,14 @@ Status ConvertProtoToJson(const Proto& proto_output, std::string* json_output) { // tensorflow::StringPiece. auto error_msg = status.message(); return errors::Internal( - strings::StrCat("Could not convert proto to JSON string: ", - StringPiece(error_msg.data(), error_msg.length()))); + "Could not convert proto to JSON string: ", + absl::string_view(error_msg.data(), error_msg.length())); } return Status::OK(); } // Returns the tool name with extension. 
-string ToolName(absl::string_view tool) { +std::string ToolName(absl::string_view tool) { if (tool == kTraceViewer) return "trace.json.gz"; if (tool == kMemoryProfile) return "memory_profile.json.gz"; return absl::StrCat(tool, ".pb"); From b1fc80f4a199f353d5bce0a79689b08181d3d96d Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Fri, 15 May 2020 12:00:16 -0700 Subject: [PATCH 281/412] Reduce Functional.__call__ Python overhead by ~5-10% PiperOrigin-RevId: 311775071 Change-Id: I45dd0a1ce865d6c17f7b5e292799348e1e17a91c --- tensorflow/python/keras/engine/functional.py | 46 +++++++------------- tensorflow/python/keras/engine/node.py | 4 ++ 2 files changed, 19 insertions(+), 31 deletions(-) diff --git a/tensorflow/python/keras/engine/functional.py b/tensorflow/python/keras/engine/functional.py index c79e2849c4f..f219e590daf 100644 --- a/tensorflow/python/keras/engine/functional.py +++ b/tensorflow/python/keras/engine/functional.py @@ -469,11 +469,11 @@ class Functional(training_lib.Model): mask: (Optional) Tensor or nested structure of Tensors. Returns: - Two lists: output_tensors, output_masks + output_tensors """ inputs = self._flatten_to_reference_inputs(inputs) if mask is None: - masks = [None for _ in range(len(inputs))] + masks = [None] * len(inputs) else: masks = self._flatten_to_reference_inputs(mask) for input_t, mask in zip(inputs, masks): @@ -481,55 +481,39 @@ class Functional(training_lib.Model): # Dictionary mapping reference tensors to computed tensors. tensor_dict = {} + tensor_usage_count = self._tensor_usage_count for x, y in zip(self.inputs, inputs): y = self._conform_to_reference_input(y, ref_input=x) x_id = str(id(x)) - tensor_dict[x_id] = [y] * self._tensor_usage_count[x_id] + tensor_dict[x_id] = [y] * tensor_usage_count[x_id] - depth_keys = list(self._nodes_by_depth.keys()) + nodes_by_depth = self._nodes_by_depth + depth_keys = list(nodes_by_depth.keys()) depth_keys.sort(reverse=True) for depth in depth_keys: - nodes = self._nodes_by_depth[depth] + nodes = nodes_by_depth[depth] for node in nodes: if node.is_input: continue # Input tensors already exist. - if not all( - str(id(tensor)) in tensor_dict - for tensor in nest.flatten(node.keras_inputs)): + if any(t_id not in tensor_dict for t_id in node.flat_input_ids): continue # Node is not computable, try skipping. - layer = node.layer args, kwargs = node.map_arguments(tensor_dict) - outputs = layer(*args, **kwargs) + outputs = node.layer(*args, **kwargs) # Update tensor_dict. - for x, y in zip(nest.flatten(node.outputs), nest.flatten(outputs)): - x_id = str(id(x)) - tensor_dict[x_id] = [y] * self._tensor_usage_count[x_id] + for x_id, y in zip(node.flat_output_ids, nest.flatten(outputs)): + tensor_dict[x_id] = [y] * tensor_usage_count[x_id] output_tensors = [] - output_shapes = [] for x in self.outputs: - assert str(id(x)) in tensor_dict, 'Could not compute output ' + str(x) - tensor = tensor_dict[str(id(x))].pop() - output_shapes.append(x.shape) - output_tensors.append(tensor) + x_id = str(id(x)) + assert x_id in tensor_dict, 'Could not compute output ' + str(x) + output_tensors.append(tensor_dict[x_id].pop()) - if output_shapes is not None: - input_shapes = [x.shape for x in inputs] - try: - cache_key = tuple(tf_utils.convert_shapes(input_shapes, to_tuples=True)) - self._output_shape_cache[cache_key] = nest.pack_sequence_as( - self._nested_outputs, output_shapes) - except ValueError: - # In case there are unknown TensorShape, eg for sparse tensor input, - # We skip the caching since the shape is unknown. 
- pass - - output_tensors = nest.pack_sequence_as(self._nested_outputs, output_tensors) - return output_tensors + return nest.pack_sequence_as(self._nested_outputs, output_tensors) def _flatten_to_reference_inputs(self, tensors): """Maps `tensors` to their respective `keras.Input`.""" diff --git a/tensorflow/python/keras/engine/node.py b/tensorflow/python/keras/engine/node.py index 945cf1c64bd..a9e0b621d75 100644 --- a/tensorflow/python/keras/engine/node.py +++ b/tensorflow/python/keras/engine/node.py @@ -102,6 +102,10 @@ class Node(object): tensor._keras_history = KerasHistory( layer=layer, node_index=node_index, tensor_index=i) + # Cached for performance. + self.flat_input_ids = [str(id(t)) for t in self._keras_inputs] + self.flat_output_ids = [str(id(t)) for t in nest.flatten(self.outputs)] + @property def keras_inputs(self): """Tensors input to this node that can be traced back to a `keras.Input`.""" From 0b59eaf0bf66b71dc108ec4f73c548fc48abc36d Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Fri, 15 May 2020 12:09:13 -0700 Subject: [PATCH 282/412] Provide builtin_op_kernels target with Ruy and GEMV caching unconditionally enabled PiperOrigin-RevId: 311776871 Change-Id: I948ea5524fdcf17c36e6219fb1ae18fafdecee4e --- tensorflow/lite/kernels/BUILD | 294 +++++++++++++++++++--------------- 1 file changed, 168 insertions(+), 126 deletions(-) diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD index 3a29fee5699..657b5d89a85 100644 --- a/tensorflow/lite/kernels/BUILD +++ b/tensorflow/lite/kernels/BUILD @@ -235,6 +235,15 @@ cc_library( visibility = ["//visibility:private"], ) +cc_library( + name = "tflite_with_ruy_and_caching_enabled", + defines = [ + "TFLITE_WITH_RUY", + "TFLITE_WITH_RUY_GEMV", + ], + visibility = ["//visibility:private"], +) + cc_library( name = "tflite_with_ruy_default", build_for_embedded = True, @@ -423,140 +432,157 @@ cc_library( ], ) +BUILTIN_KERNEL_SRCS = [ + "activations.cc", + "add.cc", + "add_n.cc", + "arg_min_max.cc", + "audio_spectrogram.cc", + "basic_rnn.cc", + "batch_matmul.cc", + "batch_to_space_nd.cc", + "bidirectional_sequence_lstm.cc", + "bidirectional_sequence_rnn.cc", + "cast.cc", + "ceil.cc", + "comparisons.cc", + "concatenation.cc", + "conv.cc", + "densify.cc", + "depth_to_space.cc", + "depthwise_conv.cc", + "dequantize.cc", + "detection_postprocess.cc", + "div.cc", + "elementwise.cc", + "embedding_lookup.cc", + "embedding_lookup_sparse.cc", + "exp.cc", + "expand_dims.cc", + "fake_quant.cc", + "fill.cc", + "floor.cc", + "floor_div.cc", + "floor_mod.cc", + "fully_connected.cc", + "gather.cc", + "gather_nd.cc", + "hashtable_lookup.cc", + "if.cc", + "l2norm.cc", + "local_response_norm.cc", + "logical.cc", + "lsh_projection.cc", + "lstm.cc", + "matrix_diag.cc", + "matrix_set_diag.cc", + "maximum_minimum.cc", + "mfcc.cc", + "mirror_pad.cc", + "mul.cc", + "neg.cc", + "non_max_suppression.cc", + "numeric_verify.cc", + "one_hot.cc", + "pack.cc", + "pad.cc", + "pooling.cc", + "pow.cc", + "quantize.cc", + "range.cc", + "rank.cc", + "reduce.cc", + "reshape.cc", + "resize_bilinear.cc", + "resize_nearest_neighbor.cc", + "reverse.cc", + "reverse_sequence.cc", + "round.cc", + "scatter_nd.cc", + "segment_sum.cc", + "select.cc", + "shape.cc", + "skip_gram.cc", + "slice.cc", + "space_to_batch_nd.cc", + "space_to_depth.cc", + "sparse_to_dense.cc", + "split.cc", + "split_v.cc", + "squared_difference.cc", + "squeeze.cc", + "strided_slice.cc", + "sub.cc", + "svdf.cc", + "tile.cc", + "topk_v2.cc", + "transpose.cc", + "transpose_conv.cc", + 
"unidirectional_sequence_lstm.cc", + "unidirectional_sequence_rnn.cc", + "unique.cc", + "unpack.cc", + "where.cc", + "while.cc", + "zeros_like.cc", +] + +BUILTIN_KERNEL_DEPS = [ + ":cpu_backend_context", + ":cpu_backend_gemm", + ":cpu_backend_threadpool", + ":eigen_support", + ":kernel_util", + ":lstm_eval", + ":lstm_shared", + ":op_macros", + ":padding", + "@com_google_absl//absl/memory", + "@com_google_absl//absl/strings", + "//third_party/eigen3", + "@flatbuffers", + "//tensorflow/lite:framework_lib", + "//tensorflow/lite:minimal_logging", + "//tensorflow/lite:string_util", + "//tensorflow/lite/c:common", + "//tensorflow/lite/kernels/internal:audio_utils", + "//tensorflow/lite/kernels/internal:common", + "//tensorflow/lite/kernels/internal:compatibility", + "//tensorflow/lite/kernels/internal:cpu_check", + "//tensorflow/lite/kernels/internal:kernel_utils", + "//tensorflow/lite/kernels/internal:optimized", + "//tensorflow/lite/kernels/internal:optimized_base", + "//tensorflow/lite/kernels/internal:quantization_util", + "//tensorflow/lite/kernels/internal:reference_base", + "//tensorflow/lite/kernels/internal:strided_slice_logic", + "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/kernels/internal:tensor_utils", + "//tensorflow/lite/kernels/internal:types", +] + cc_library( name = "builtin_op_kernels", - srcs = [ - "activations.cc", - "add.cc", - "add_n.cc", - "arg_min_max.cc", - "audio_spectrogram.cc", - "basic_rnn.cc", - "batch_matmul.cc", - "batch_to_space_nd.cc", - "bidirectional_sequence_lstm.cc", - "bidirectional_sequence_rnn.cc", - "cast.cc", - "ceil.cc", - "comparisons.cc", - "concatenation.cc", - "conv.cc", - "densify.cc", - "depth_to_space.cc", - "depthwise_conv.cc", - "dequantize.cc", - "detection_postprocess.cc", - "div.cc", - "elementwise.cc", - "embedding_lookup.cc", - "embedding_lookup_sparse.cc", - "exp.cc", - "expand_dims.cc", - "fake_quant.cc", - "fill.cc", - "floor.cc", - "floor_div.cc", - "floor_mod.cc", - "fully_connected.cc", - "gather.cc", - "gather_nd.cc", - "hashtable_lookup.cc", - "if.cc", - "l2norm.cc", - "local_response_norm.cc", - "logical.cc", - "lsh_projection.cc", - "lstm.cc", - "matrix_diag.cc", - "matrix_set_diag.cc", - "maximum_minimum.cc", - "mfcc.cc", - "mirror_pad.cc", - "mul.cc", - "neg.cc", - "non_max_suppression.cc", - "numeric_verify.cc", - "one_hot.cc", - "pack.cc", - "pad.cc", - "pooling.cc", - "pow.cc", - "quantize.cc", - "range.cc", - "rank.cc", - "reduce.cc", - "reshape.cc", - "resize_bilinear.cc", - "resize_nearest_neighbor.cc", - "reverse.cc", - "reverse_sequence.cc", - "round.cc", - "scatter_nd.cc", - "segment_sum.cc", - "select.cc", - "shape.cc", - "skip_gram.cc", - "slice.cc", - "space_to_batch_nd.cc", - "space_to_depth.cc", - "sparse_to_dense.cc", - "split.cc", - "split_v.cc", - "squared_difference.cc", - "squeeze.cc", - "strided_slice.cc", - "sub.cc", - "svdf.cc", - "tile.cc", - "topk_v2.cc", - "transpose.cc", - "transpose_conv.cc", - "unidirectional_sequence_lstm.cc", - "unidirectional_sequence_rnn.cc", - "unique.cc", - "unpack.cc", - "where.cc", - "while.cc", - "zeros_like.cc", - ], + srcs = BUILTIN_KERNEL_SRCS, hdrs = [ "dequantize.h", ], copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS, visibility = ["//visibility:private"], - deps = [ - ":cpu_backend_context", - ":cpu_backend_gemm", - ":cpu_backend_threadpool", - ":eigen_support", - ":kernel_util", - ":lstm_eval", - ":lstm_shared", - ":op_macros", - ":padding", - "//tensorflow/lite:framework_lib", - "//tensorflow/lite:minimal_logging", - 
"//tensorflow/lite:string_util", - "//tensorflow/lite/c:common", - "//tensorflow/lite/kernels/internal:audio_utils", - "//tensorflow/lite/kernels/internal:common", - "//tensorflow/lite/kernels/internal:compatibility", - "//tensorflow/lite/kernels/internal:cpu_check", - "//tensorflow/lite/kernels/internal:kernel_utils", - "//tensorflow/lite/kernels/internal:optimized", - "//tensorflow/lite/kernels/internal:optimized_base", - "//tensorflow/lite/kernels/internal:quantization_util", - "//tensorflow/lite/kernels/internal:reference_base", - "//tensorflow/lite/kernels/internal:strided_slice_logic", - "//tensorflow/lite/kernels/internal:tensor", - "//tensorflow/lite/kernels/internal:tensor_utils", - "//tensorflow/lite/kernels/internal:types", - "//third_party/eigen3", - "@com_google_absl//absl/memory", - "@com_google_absl//absl/strings", - "@farmhash_archive//:farmhash", - "@flatbuffers", + deps = BUILTIN_KERNEL_DEPS + ["@farmhash_archive//:farmhash"], +) + +# Creates a target where Ruy is unconditionally enabled along with caching +# on GEMV operations. This is useful for TF Lite deployments where custom +# copts are not allowed, e.g. b/156119344 +cc_library( + name = "builtin_op_kernels_ruy_and_caching", + srcs = BUILTIN_KERNEL_SRCS, + hdrs = [ + "dequantize.h", ], + copts = tflite_copts() + tf_opts_nortti_if_android() + EXTRA_EIGEN_COPTS, + visibility = ["//visibility:private"], + deps = BUILTIN_KERNEL_DEPS + ["@farmhash_archive//:farmhash"] + [":tflite_with_ruy_and_caching_enabled"], ) cc_library( @@ -673,6 +699,22 @@ cc_library( ], ) +# TODO(b/156664104) Remove once runtime flag available. +cc_library( + name = "builtin_ops_ruy_and_caching_enabled", + srcs = ["register.cc"], + hdrs = [ + "builtin_op_kernels.h", + "fully_connected.h", + "register.h", + ], + deps = [ + ":builtin_op_kernels_ruy_and_caching", + "//tensorflow/lite:framework_lib", + "//tensorflow/lite/c:common", + ], +) + # The builtin_ops target will resolve to optimized kernels when available. This # target uses reference kernels only, and is useful for validation and testing. # It should *not* generally be used in production. From d968853cc6825c705a4443844319279c464b152e Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: Fri, 15 May 2020 12:12:51 -0700 Subject: [PATCH 283/412] Skip TFE_ContextAsyncWait for tfrt. In current TF-TFRT integration, all ops are executed synchronously. We will revisit this later. 
PiperOrigin-RevId: 311777624 Change-Id: I3a27805dcce53ccf572f3c500d6fd0a532b286b2 --- tensorflow/c/eager/c_api.cc | 4 +--- tensorflow/c/eager/context_interface.h | 3 +++ tensorflow/core/common_runtime/eager/context.h | 2 ++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 5c01ccb82bb..f5535c80d30 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -899,9 +899,7 @@ TF_CAPI_EXPORT extern void TFE_ContextAsyncWait(TFE_Context* ctx, #if defined(IS_MOBILE_PLATFORM) status->status = tensorflow::Status::OK(); #else // !defined(IS_MOBILE_PLATFORM) - tensorflow::EagerContext* context = - tensorflow::ContextFromInterface(tensorflow::unwrap(ctx)); - status->status = context->SyncExecutors(); + status->status = tensorflow::unwrap(ctx)->AsyncWait(); #endif // !IS_MOBILE_PLATFORM } diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/context_interface.h index d21ab45e579..76f182f4945 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/context_interface.h @@ -101,6 +101,9 @@ class AbstractContextInterface { // Destroy the step resource container for a training step. virtual void EndStep() = 0; + // Block until all pending nodes are finished, + virtual Status AsyncWait() = 0; + protected: virtual ~AbstractContextInterface() {} }; diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index d034aaf2f9c..d03a91c817a 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -295,6 +295,8 @@ class EagerContext : public AbstractContextInterface, public core::RefCounted { // errors, and the error message will be combined from all executors. Status SyncExecutors(); + Status AsyncWait() override { return SyncExecutors(); } + core::RefCountPtr GetCachedKernel(Fprint128 cache_key); void AddKernelToCache(Fprint128 cache_key, KernelAndDevice* kernel); From 985275ea27f1e542da9d267a0a42b791d4159ac5 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 15 May 2020 12:15:58 -0700 Subject: [PATCH 284/412] When restoring a variable with an initializer, pass through restore metadata rather than forgetting it This avoids 2x memory usage when restoring with a distribution strategy, since otherwise variables are restored twice (with two live copies the second time). 
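A minimal, hypothetical sketch of the bookkeeping described above (plain Python with made-up classes, not the real TF types): when the initial value carries checkpoint metadata, the variable creator now copies the restore generation onto the new variable, so a later assignment to the checkpoint object treats it as already restored instead of loading the value a second time.

class CheckpointInitialValue(object):
  """Pairs an initial value with the restore it came from."""

  def __init__(self, wrapped_value, restore_uid):
    self.wrapped_value = wrapped_value
    self.restore_uid = restore_uid


class Variable(object):

  def __init__(self, initial_value):
    self.value = initial_value
    self.update_uid = -1  # -1 means "never restored".


def create_variable(initial_value):
  restore_uid = None
  if isinstance(initial_value, CheckpointInitialValue):
    restore_uid = initial_value.restore_uid
    initial_value = initial_value.wrapped_value
  var = Variable(initial_value)
  if restore_uid is not None:
    # Previously this metadata was dropped, so the checkpoint machinery
    # loaded the value a second time after the variable was created.
    var.update_uid = restore_uid
  return var


v = create_variable(CheckpointInitialValue([1.0] * 5, restore_uid=7))
assert v.update_uid == 7  # Tagged as restored; no second load is needed.

The checkpointing_test.py change in the diff below exercises the real behavior end to end under several distribution strategies.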
PiperOrigin-RevId: 311778129 Change-Id: I60c1c23d0b554d30e3913f588e6f11a7c430fe71 --- .../python/distribute/checkpointing_test.py | 36 +++++++++++++++++++ .../python/distribute/distribute_lib.py | 20 ++++++++--- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/distribute/checkpointing_test.py b/tensorflow/python/distribute/checkpointing_test.py index 040faf6f6ce..ad646905315 100644 --- a/tensorflow/python/distribute/checkpointing_test.py +++ b/tensorflow/python/distribute/checkpointing_test.py @@ -32,6 +32,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.keras.engine import training from tensorflow.python.keras.layers import core from tensorflow.python.keras.optimizer_v2 import adam +from tensorflow.python.ops import array_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training import adam as adam_v1 @@ -96,6 +97,41 @@ class TrainingCheckpointTests(test.TestCase, parameterized.TestCase): self.assertEqual((training_continuation + 1) * num_training_steps, root.optimizer_step.numpy()) + @combinations.generate( + combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_one_cpu, + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + strategy_combinations.central_storage_strategy_with_two_gpus, + ], + mode=["eager"])) + def testInitializeFromCheckpoint(self, distribution): + variable_shape = [5] + save_checkpoint = trackable_utils.Checkpoint(v=variables_lib.Variable( + array_ops.ones(variable_shape))) + save_path = save_checkpoint.save( + os.path.join(self.get_temp_dir(), "checkpoint")) + with distribution.scope(): + restore_checkpoint = trackable_utils.Checkpoint() + restore_checkpoint.restore(save_path) + initial_value = restore_checkpoint._preload_simple_restoration( + "v", variable_shape) + v = variables_lib.Variable(initial_value) + # Check that the variable is now tagged as restored. `Checkpoint` then + # knows it doesn't have to restore `v`'s value when it's assigned to an + # object. + self.assertGreater(v._update_uid, 0) + self.assertAllClose(array_ops.ones(variable_shape), v) + v.assign(array_ops.zeros(variable_shape)) + # Assignment to an object should not trigger restoration, since we already + # restored the object through an initializer. This wouldn't be a + # correctness issue, but it would mean that models would use twice as much + # memory when loading (the buffer already assigned to the variable, and + # the new restoration). + restore_checkpoint.v = v + self.assertAllClose(array_ops.zeros(variable_shape), v) + @combinations.generate( combinations.combine( distribution=[ diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 6baa15f59c1..4531e922840 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1772,13 +1772,25 @@ class StrategyExtendedV2(object): kwargs["distribute_strategy"] = strategy # Unwrap `initial_value` if it is a `CheckpointInitialValue` to avoid - # dereferencing a `Tensor` that is without a `name`. - # TODO(b/138130844): Revisit the following check once - # `CheckpointInitialValue` class is removed. + # dereferencing a `Tensor` that is without a `name`. We still need to + # propagate the metadata it's holding. 
if isinstance(kwargs["initial_value"], trackable.CheckpointInitialValue): + checkpoint_restore_uid = kwargs[ + "initial_value"].checkpoint_position.restore_uid kwargs["initial_value"] = kwargs["initial_value"].wrapped_value + else: + checkpoint_restore_uid = None - return self._create_variable(next_creator, **kwargs) + created = self._create_variable(next_creator, **kwargs) + + if checkpoint_restore_uid is not None: + # pylint: disable=protected-access + # Let the checkpointing infrastructure know that the variable was + # already restored so it doesn't waste memory loading the value again. + created._maybe_initialize_trackable() + created._update_uid = checkpoint_restore_uid + # pylint: enable=protected-access + return created def distributed_getter(getter, *args, **kwargs): if not self._allow_variable_partition(): From 28229ffdbf8d996449bf2ad8289d18201f21ca7b Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 15 May 2020 12:20:44 -0700 Subject: [PATCH 285/412] Delete Tensor constructor that takes a pointer. Otherwise, say, std::make_unique(GetTensorSomewhereThatActuallyReturnsAPointer()) would construct boolean tensor without a compile time error. PiperOrigin-RevId: 311778946 Change-Id: Ibdb69ff7c4a9697028ed30ac40ffb0797b4493f9 --- tensorflow/core/framework/tensor.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h index 54541be0b4f..744a14e007e 100644 --- a/tensorflow/core/framework/tensor.h +++ b/tensorflow/core/framework/tensor.h @@ -18,6 +18,7 @@ limitations under the License. #include #include + #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/allocator.h" #include "tensorflow/core/framework/tensor_shape.h" @@ -239,6 +240,12 @@ class Tensor { /// are not valid. Tensor(Tensor&& other); + // Explicitly delete constructor that take a pointer (except char*) + // so that the pointer doesn't get implicitly cast to bool. + template ::value, + T>::type* = nullptr> + explicit Tensor(T* t) = delete; + ~Tensor(); /// Returns the data type. From 321d3d9fd09b956e163e859909465690f73806a8 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 15 May 2020 12:37:37 -0700 Subject: [PATCH 286/412] Update Eigen to: https://gitlab.com/libeigen/eigen/-/commit/9b411757abd8458f9689b1384c6bf75da9b82357 PiperOrigin-RevId: 311782120 Change-Id: I8b68ee1dbc23f1e7861c17d8a7715867860124dc --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 949c6920e33..404d253e8bd 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -237,11 +237,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "2c7c0aec4271dfca6b8a7707e2112f67c4cb3bdf7c89c0e98d3fcd39707c4468", # SHARED_EIGEN_SHA - strip_prefix = "eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51", + sha256 = "59f7cc665fff375f142d558e7c08c95ac254fa13d077cbecce757a556d30e0d9", # SHARED_EIGEN_SHA + strip_prefix = "eigen-9b411757abd8458f9689b1384c6bf75da9b82357", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/49f1aeb60d9f759859fce0d16aa5d1ecc7168d51/eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51.tar.gz", - "https://gitlab.com/libeigen/eigen/-/archive/49f1aeb60d9f759859fce0d16aa5d1ecc7168d51/eigen-49f1aeb60d9f759859fce0d16aa5d1ecc7168d51.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/9b411757abd8458f9689b1384c6bf75da9b82357/eigen-9b411757abd8458f9689b1384c6bf75da9b82357.tar.gz", + "https://gitlab.com/libeigen/eigen/-/archive/9b411757abd8458f9689b1384c6bf75da9b82357/eigen-9b411757abd8458f9689b1384c6bf75da9b82357.tar.gz", ], ) From 2db0d85d05a3dad9153b9afa6bd2ed5ba7c24102 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 12:40:36 -0700 Subject: [PATCH 287/412] Fix case where embedding column -> use_safe_embedding = false is used with variable partitioning. PiperOrigin-RevId: 311782693 Change-Id: I38b59943a25adbe77e9f3f01c49a713876cc3f22 --- .../python/feature_column/feature_column.py | 4 +- .../feature_column/feature_column_test.py | 312 ++++++++++++------ .../feature_column/feature_column_v2.py | 4 +- .../feature_column/feature_column_v2_test.py | 193 +++++++---- 4 files changed, 336 insertions(+), 177 deletions(-) diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py index 87420d0e850..07df4e914c9 100644 --- a/tensorflow/python/feature_column/feature_column.py +++ b/tensorflow/python/feature_column/feature_column.py @@ -2546,7 +2546,7 @@ class _EmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. return embedding_lookup_sparse( embedding_weights, @@ -2696,7 +2696,7 @@ class _SharedEmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. 
return embedding_lookup_sparse( embedding_weights, diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py index 21def9cfa2c..38800fc2162 100644 --- a/tensorflow/python/feature_column/feature_column_test.py +++ b/tensorflow/python/feature_column/feature_column_test.py @@ -21,6 +21,7 @@ from __future__ import print_function import collections import copy +from absl.testing import parameterized import numpy as np from tensorflow.core.example import example_pb2 @@ -852,9 +853,9 @@ class HashedCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -1714,10 +1715,10 @@ class LinearModelTest(test.TestCase): # We check the mapping by checking that we have the right keys, # and that the values (output_tensors) were indeed the ones used to # form the input layer. - self.assertItemsEqual(all_cols, cols_to_output_tensors.keys()) + self.assertCountEqual(all_cols, cols_to_output_tensors.keys()) input_layer_inputs = [tensor for tensor in input_layer.op.inputs[:-1]] output_tensors = [tensor for tensor in cols_to_output_tensors.values()] - self.assertItemsEqual(input_layer_inputs, output_tensors) + self.assertCountEqual(input_layer_inputs, output_tensors) def test_dense_collection(self): price = fc._numeric_column('price') @@ -2841,7 +2842,7 @@ class FunctionalInputLayerTest(test.TestCase): cols_to_vars = {} all_cols = [price1, dense_feature_bucketized, some_embedding_column] fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -2891,7 +2892,7 @@ class FunctionalInputLayerTest(test.TestCase): shared_embedding_a, shared_embedding_b ] fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -2927,7 +2928,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_from_feature_columns', partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0)): fc.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(3, len(cols_to_vars[some_embedding_column])) @@ -3043,7 +3044,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_layer/sparse_feature_embedding/embedding_weights:0', 'input_layer_1/sparse_feature_embedding/embedding_weights:0' ] - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -3077,7 
+3078,7 @@ class FunctionalInputLayerTest(test.TestCase): # Make sure that only 1 variable gets created in this case. self.assertEqual(1, len( ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -3129,7 +3130,7 @@ class FunctionalInputLayerTest(test.TestCase): # Make sure that only 1 variable gets created in this case. self.assertEqual(1, len( ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))) - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -3618,9 +3619,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -4058,9 +4059,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -4363,9 +4364,9 @@ class IdentityCategoricalColumnTest(test.TestCase): 'aaa': inputs }), weight_collections=('my_weights',)) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) - self.assertItemsEqual([], ops.get_collection('my_weights')) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)) + self.assertCountEqual([], ops.get_collection('my_weights')) @test_util.run_deprecated_v1 def test_get_sparse_tensors_dense_input(self): @@ -4820,7 +4821,7 @@ class IndicatorColumnTest(test.TestCase): self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net)) -class EmbeddingColumnTest(test.TestCase): +class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): @test_util.run_deprecated_v1 def test_defaults(self): @@ -4956,10 +4957,29 @@ class EmbeddingColumnTest(test.TestCase): _assert_sparse_tensor_value(self, self.evaluate(output_a), self.evaluate(output_embedded)) + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True, + 'partition_variables': False, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, + }) @test_util.run_deprecated_v1 - def test_get_dense_tensor(self): + def test_get_dense_tensor(self, use_safe_embedding_lookup, + partition_variables): # Inputs. 
- vocabulary_size = 3 + vocabulary_size = 4 sparse_input = sparse_tensor.SparseTensorValue( # example 0, ids [2] # example 1, ids [0, 1] @@ -4974,12 +4994,20 @@ class EmbeddingColumnTest(test.TestCase): embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) # id 3 ) - def _initializer(shape, dtype, partition_info): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + + def _initializer(shape, dtype, partition_info=None): + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. @@ -4997,25 +5025,43 @@ class EmbeddingColumnTest(test.TestCase): # Build columns. categorical_column = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc._embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column = fc._embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) - # Provide sparse input and get dense result. - embedding_lookup = embedding_column._get_dense_tensor( - _LazyBuilder({ - 'aaa': sparse_input - })) + # Provide sparse input and get dense result. + embedding_lookup = embedding_column._get_dense_tensor( + _LazyBuilder({'aaa': sparse_input})) # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual(('vars/embedding_weights/part_0:0', + 'vars/embedding_weights/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual(('vars/embedding_weights:0',), + tuple([v.name for v in global_vars])) + for v in global_vars: + self.assertIsInstance(v, variables_lib.Variable) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup)) + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + @test_util.run_deprecated_v1 def test_get_dense_tensor_3d(self): # Inputs. @@ -5072,7 +5118,7 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) @@ -5102,11 +5148,11 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) my_vars = ops.get_collection('my_vars') - self.assertItemsEqual( - ('embedding_weights:0',), tuple([v.name for v in my_vars])) + self.assertCountEqual(('embedding_weights:0',), + tuple([v.name for v in my_vars])) @test_util.run_deprecated_v1 def test_get_dense_tensor_placeholder_inputs(self): @@ -5169,8 +5215,8 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('embedding_weights:0',), tuple([v.name for v in global_vars])) + self.assertCountEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, embedding_lookup.eval( @@ -5233,8 +5279,8 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('embedding_weights:0',), tuple([v.name for v in global_vars])) + self.assertCountEqual(('embedding_weights:0',), + tuple([v.name for v in global_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup)) @@ -5280,14 +5326,14 @@ class EmbeddingColumnTest(test.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -5361,14 +5407,14 @@ class EmbeddingColumnTest(test.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -5450,13 +5496,11 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('input_layer/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual( - ('input_layer/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) with _initialized_session(): self.assertAllEqual(embedding_values, trainable_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(input_layer)) @@ -5513,17 +5557,16 @@ class EmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( - ('input_layer/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) - self.assertItemsEqual( - [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) + self.assertCountEqual([], + ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) with _initialized_session(): self.assertAllEqual(embedding_values, global_vars[0].eval()) self.assertAllEqual(expected_lookups, self.evaluate(input_layer)) -class SharedEmbeddingColumnTest(test.TestCase): +class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): @test_util.run_deprecated_v1 def test_defaults(self): @@ -5772,33 +5815,59 @@ class SharedEmbeddingColumnTest(test.TestCase): _assert_sparse_tensor_value(self, self.evaluate(output_b), self.evaluate(output_b_embedded)) + @parameterized.named_parameters( + { + 'testcase_name': 'use_safe_embedding_lookup', + 'use_safe_embedding_lookup': True, + 'partition_variables': False, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup', + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, + }) @test_util.run_deprecated_v1 - def test_get_dense_tensor(self): + def test_get_dense_tensor(self, use_safe_embedding_lookup, + partition_variables): # Inputs. - vocabulary_size = 3 + vocabulary_size = 4 # -1 values are ignored. - input_a = np.array( - [[2, -1, -1], # example 0, ids [2] - [0, 1, -1]]) # example 1, ids [0, 1] - input_b = np.array( - [[0, -1, -1], # example 0, ids [0] - [-1, -1, -1]]) # example 1, ids [] - input_features = { - 'aaa': input_a, - 'bbb': input_b - } + input_a = np.array([ + [2, -1, -1], # example 0, ids [2] + [0, 1, -1] + ]) # example 1, ids [0, 1] + input_b = np.array([ + [0, -1, -1], # example 0, ids [0] + [-1, -1, -1] + ]) # example 1, ids [] + input_features = {'aaa': input_a, 'bbb': input_b} # Embedding variable. embedding_dimension = 2 embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) 
# id 3 ) - def _initializer(shape, dtype, partition_info): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + + def _initializer(shape, dtype, partition_info=None): + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. @@ -5808,38 +5877,65 @@ class SharedEmbeddingColumnTest(test.TestCase): # example 1: (2., 3.5), # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5] ) - expected_lookups_b = ( - # example 0: - (1., 2.), # ids [0], embedding = [1, 2] - # example 1: - (0., 0.), # ids [], embedding = [0, 0] - ) + if use_safe_embedding_lookup: + expected_lookups_b = ( + # example 0: + (1., 2.), # ids [0], embedding = [1, 2] + # example 1: + (0., 0.), # ids [], embedding = [0, 0] + ) + else: + expected_lookups_b = ( + # example 0: + (1., 2.), # ids [0], embedding = [1, 2] + ) # Build columns. categorical_column_a = fc._categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) categorical_column_b = fc._categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer) - # Provide sparse input and get dense result. - embedding_lookup_a = embedding_column_a._get_dense_tensor( - _LazyBuilder(input_features)) - embedding_lookup_b = embedding_column_b._get_dense_tensor( - _LazyBuilder(input_features)) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + # Provide sparse input and get dense result. + embedding_lookup_a = embedding_column_a._get_dense_tensor( + _LazyBuilder(input_features)) + embedding_lookup_b = embedding_column_b._get_dense_tensor( + _LazyBuilder(input_features)) # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual(('vars/embedding_weights/part_0:0', + 'vars/embedding_weights/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual(('vars/embedding_weights:0',), + tuple([v.name for v in global_vars])) embedding_var = global_vars[0] - with _initialized_session(): - self.assertAllEqual(embedding_values, self.evaluate(embedding_var)) - self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a)) - self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b)) + + self.evaluate(variables_lib.global_variables_initializer()) + self.evaluate(lookup_ops.tables_initializer()) + + self.assertAllEqual(embedding_values, self.evaluate(embedding_var)) + self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a)) + self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b)) + + if use_safe_embedding_lookup: + self.assertIn('SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) + else: + self.assertNotIn( + 'SparseFillEmptyRows', + [x.type for x in ops.get_default_graph().get_operations()]) @test_util.run_deprecated_v1 def test_get_dense_tensor_weight_collections(self): @@ -5886,11 +5982,11 @@ class SharedEmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( + self.assertCountEqual( ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',), tuple(v.name for v in global_vars)) my_vars = ops.get_collection('my_vars') - self.assertItemsEqual( + self.assertCountEqual( ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',), tuple(v.name for v in my_vars)) @@ -5997,14 +6093,14 @@ class SharedEmbeddingColumnTest(test.TestCase): 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0', 'linear_model/aaa_bbb_shared_embedding_1/weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection( ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0'] @@ -6091,14 +6187,14 @@ class SharedEmbeddingColumnTest(test.TestCase): 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0', 'linear_model/aaa_bbb_shared_embedding_1/weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_bbb_shared_embedding/embedding_weights:0'] @@ -6195,16 +6291,16 @@ class SharedEmbeddingColumnTest(test.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], tuple([v.name for v in global_vars])) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) if trainable: - self.assertItemsEqual( + self.assertCountEqual( ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'], tuple([v.name for v in trainable_vars])) else: - self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) + self.assertCountEqual([], tuple([v.name for v in trainable_vars])) shared_embedding_vars = global_vars with _initialized_session(): self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval()) diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py index 23a9861eb1b..b572987d52d 100644 --- a/tensorflow/python/feature_column/feature_column_v2.py +++ b/tensorflow/python/feature_column/feature_column_v2.py @@ -3263,7 +3263,7 @@ class EmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. return embedding_lookup_sparse( embedding_weights, @@ -3558,7 +3558,7 @@ class SharedEmbeddingColumn( embedding_lookup_sparse = embedding_ops.safe_embedding_lookup_sparse if (not self.use_safe_embedding_lookup and sparse_id_rank is not None and sparse_id_rank <= 2): - embedding_lookup_sparse = (embedding_ops.embedding_lookup_sparse) + embedding_lookup_sparse = embedding_ops.embedding_lookup_sparse_v2 # Return embedding lookup result. return embedding_lookup_sparse( embedding_weights, diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py index fe769850fb0..cba87a51c23 100644 --- a/tensorflow/python/feature_column/feature_column_v2_test.py +++ b/tensorflow/python/feature_column/feature_column_v2_test.py @@ -2087,7 +2087,7 @@ class LinearModelTest(test.TestCase): for var in model.variables: self.assertIsInstance(var, variables_lib.VariableV1) variable_names = [var.name for var in model.variables] - self.assertItemsEqual([ + self.assertCountEqual([ 'linear_model/dense_feature_bucketized/weights:0', 'linear_model/price1/weights:0', 'linear_model/sparse_feature_embedding/embedding_weights:0', @@ -2731,10 +2731,10 @@ class OldLinearModelTest(test.TestCase): # We check the mapping by checking that we have the right keys, # and that the values (output_tensors) were indeed the ones used to # form the input layer. 
- self.assertItemsEqual(all_cols, cols_to_output_tensors.keys()) + self.assertCountEqual(all_cols, cols_to_output_tensors.keys()) input_layer_inputs = [tensor for tensor in input_layer.op.inputs[:-1]] output_tensors = [tensor for tensor in cols_to_output_tensors.values()] - self.assertItemsEqual(input_layer_inputs, output_tensors) + self.assertCountEqual(input_layer_inputs, output_tensors) def test_dense_collection(self): price = fc.numeric_column('price') @@ -3411,7 +3411,7 @@ class FunctionalInputLayerTest(test.TestCase): cols_to_vars = {} all_cols = [price1, dense_feature_bucketized, some_embedding_column] fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -3461,7 +3461,7 @@ class FunctionalInputLayerTest(test.TestCase): shared_embedding_a, shared_embedding_b ] fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(1, len(cols_to_vars[some_embedding_column])) @@ -3497,7 +3497,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_from_feature_columns', partitioner=partitioned_variables.fixed_size_partitioner(3, axis=0)): fc_old.input_layer(features, all_cols, cols_to_vars=cols_to_vars) - self.assertItemsEqual(list(cols_to_vars.keys()), all_cols) + self.assertCountEqual(list(cols_to_vars.keys()), all_cols) self.assertEqual(0, len(cols_to_vars[price1])) self.assertEqual(0, len(cols_to_vars[dense_feature_bucketized])) self.assertEqual(3, len(cols_to_vars[some_embedding_column])) @@ -3616,7 +3616,7 @@ class FunctionalInputLayerTest(test.TestCase): 'input_layer/sparse_feature_embedding/embedding_weights:0', 'input_layer_1/sparse_feature_embedding/embedding_weights:0' ] - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) @@ -5904,7 +5904,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -5968,7 +5968,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6036,7 +6036,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6109,7 +6109,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6180,7 +6180,7 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('embedding_weights:0',), + self.assertCountEqual(('embedding_weights:0',), tuple([v.name for v in global_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6230,14 +6230,14 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -6274,15 +6274,25 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): @parameterized.named_parameters( { 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True + 'use_safe_embedding_lookup': True, + 'partition_variables': False, }, { 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, }) @test_util.run_deprecated_v1 - def test_dense_features(self, use_safe_embedding_lookup): + def test_dense_features(self, use_safe_embedding_lookup, partition_variables): # Inputs. - vocabulary_size = 3 + vocabulary_size = 4 sparse_input = sparse_tensor.SparseTensorValue( # example 0, ids [2] # example 1, ids [0, 1] @@ -6297,13 +6307,20 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) # id 3 ) def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. 
@@ -6321,25 +6338,43 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Build columns. categorical_column = fc.categorical_column_with_identity( key='aaa', num_buckets=vocabulary_size) - embedding_column = fc.embedding_column( - categorical_column, - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column = fc.embedding_column( + categorical_column, + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) - # Provide sparse input and get dense result. - l = df.DenseFeatures((embedding_column,)) - dense_features = l({'aaa': sparse_input}) + # Provide sparse input and get dense result. + l = df.DenseFeatures((embedding_column,)) + dense_features = l({'aaa': sparse_input}) # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights/part_0:0', + 'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in global_vars])) for v in global_vars: self.assertIsInstance(v, variables_lib.Variable) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), - tuple([v.name for v in trainable_vars])) + if partition_variables: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights/part_0:0', + 'vars/dense_features/aaa_embedding/embedding_weights/part_1:0'), + tuple([v.name for v in trainable_vars])) + else: + self.assertCountEqual( + ('vars/dense_features/aaa_embedding/embedding_weights:0',), + tuple([v.name for v in trainable_vars])) self.evaluate(variables_lib.global_variables_initializer()) self.evaluate(lookup_ops.tables_initializer()) @@ -6410,9 +6445,9 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',), + self.assertCountEqual(('dense_features/aaa_embedding/embedding_weights:0',), tuple([v.name for v in global_vars])) - self.assertItemsEqual([], + self.assertCountEqual([], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)) self.evaluate(variables_lib.global_variables_initializer()) @@ -6475,10 +6510,10 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('input_layer/aaa_embedding/embedding_weights:0',), + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), tuple([v.name for v in global_vars])) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) - self.assertItemsEqual(('input_layer/aaa_embedding/embedding_weights:0',), + self.assertCountEqual(('input_layer/aaa_embedding/embedding_weights:0',), tuple([v.name for v in trainable_vars])) self.evaluate(variables_lib.global_variables_initializer()) @@ -6528,14 +6563,14 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -6610,14 +6645,14 @@ class EmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'linear_model/aaa_embedding/weights:0', 'linear_model/aaa_embedding/embedding_weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars[ 'linear_model/aaa_embedding/embedding_weights:0'] @@ -6972,15 +7007,26 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): @parameterized.named_parameters( { 'testcase_name': 'use_safe_embedding_lookup', - 'use_safe_embedding_lookup': True + 'use_safe_embedding_lookup': True, + 'partition_variables': False, }, { 'testcase_name': 'dont_use_safe_embedding_lookup', - 'use_safe_embedding_lookup': False + 'use_safe_embedding_lookup': False, + 'partition_variables': False, + }, { + 'testcase_name': 'use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': True, + 'partition_variables': True, + }, { + 'testcase_name': 'dont_use_safe_embedding_lookup_partitioned', + 'use_safe_embedding_lookup': False, + 'partition_variables': True, }) @test_util.run_deprecated_v1 - def test_get_dense_tensor(self, use_safe_embedding_lookup): + def test_get_dense_tensor(self, use_safe_embedding_lookup, + partition_variables): # Inputs. - vocabulary_size = 3 + vocabulary_size = 4 # -1 values are ignored. input_a = np.array([ [2, -1, -1], # example 0, ids [2] @@ -6997,13 +7043,20 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): embedding_values = ( (1., 2.), # id 0 (3., 5.), # id 1 - (7., 11.) # id 2 + (7., 11.), # id 2 + (9., 13.) 
# id 3 ) def _initializer(shape, dtype, partition_info=None): - self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + if partition_variables: + self.assertEqual([vocabulary_size, embedding_dimension], + partition_info.full_shape) + self.assertAllEqual((2, embedding_dimension), shape) + else: + self.assertAllEqual((vocabulary_size, embedding_dimension), shape) + self.assertIsNone(partition_info) + self.assertEqual(dtypes.float32, dtype) - self.assertIsNone(partition_info) return embedding_values # Expected lookup result, using combiner='mean'. @@ -7031,22 +7084,32 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): key='aaa', num_buckets=vocabulary_size) categorical_column_b = fc.categorical_column_with_identity( key='bbb', num_buckets=vocabulary_size) - embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( - [categorical_column_a, categorical_column_b], - dimension=embedding_dimension, - initializer=_initializer, - use_safe_embedding_lookup=use_safe_embedding_lookup) - # Provide sparse input and get dense result. - embedding_lookup_a = embedding_column_a.get_dense_tensor( - fc.FeatureTransformationCache(input_features), None) - embedding_lookup_b = embedding_column_b.get_dense_tensor( - fc.FeatureTransformationCache(input_features), None) + partitioner = None + if partition_variables: + partitioner = partitioned_variables.fixed_size_partitioner(2, axis=0) + + with variable_scope.variable_scope('vars', partitioner=partitioner): + embedding_column_a, embedding_column_b = fc.shared_embedding_columns_v2( + [categorical_column_a, categorical_column_b], + dimension=embedding_dimension, + initializer=_initializer, + use_safe_embedding_lookup=use_safe_embedding_lookup) + # Provide sparse input and get dense result. + embedding_lookup_a = embedding_column_a.get_dense_tensor( + fc.FeatureTransformationCache(input_features), None) + embedding_lookup_b = embedding_column_b.get_dense_tensor( + fc.FeatureTransformationCache(input_features), None) # Assert expected embedding variable and lookups. global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual(('aaa_bbb_shared_embedding:0',), - tuple([v.name for v in global_vars])) + if partition_variables: + self.assertCountEqual(('vars/aaa_bbb_shared_embedding/part_0:0', + 'vars/aaa_bbb_shared_embedding/part_1:0'), + tuple([v.name for v in global_vars])) + else: + self.assertCountEqual(('vars/aaa_bbb_shared_embedding:0',), + tuple([v.name for v in global_vars])) embedding_var = global_vars[0] self.evaluate(variables_lib.global_variables_initializer()) @@ -7279,14 +7342,14 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): 'aaa_bbb_shared_embedding:0', 'linear_model/bbb_shared_embedding/weights:0', ) - self.assertItemsEqual( + self.assertCountEqual( expected_var_names, [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)]) trainable_vars = { v.name: v for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) } - self.assertItemsEqual(expected_var_names, trainable_vars.keys()) + self.assertCountEqual(expected_var_names, trainable_vars.keys()) bias = trainable_vars['linear_model/bias_weights:0'] embedding_weights = trainable_vars['aaa_bbb_shared_embedding:0'] linear_weights_a = trainable_vars[ @@ -7420,18 +7483,18 @@ class SharedEmbeddingColumnTest(test.TestCase, parameterized.TestCase): # Assert expected embedding variable and lookups. 
global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) - self.assertItemsEqual( + self.assertCountEqual( ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], tuple([v.name for v in global_vars])) for v in global_vars: self.assertIsInstance(v, variables_lib.Variable) trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES) if trainable: - self.assertItemsEqual( + self.assertCountEqual( ['aaa_bbb_shared_embedding:0', 'ccc_ddd_shared_embedding:0'], tuple([v.name for v in trainable_vars])) else: - self.assertItemsEqual([], tuple([v.name for v in trainable_vars])) + self.assertCountEqual([], tuple([v.name for v in trainable_vars])) shared_embedding_vars = global_vars self.evaluate(variables_lib.global_variables_initializer()) From 08968c30dcf6907cb2a9b0d8f56d0358cac39edf Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Fri, 15 May 2020 12:41:50 -0700 Subject: [PATCH 288/412] Clarify why we have *-gpu package. Fix https://github.com/tensorflow/tensorflow/issues/39581 PiperOrigin-RevId: 311782903 Change-Id: If002f2d2b112012e1c75e0c16f7a922546a9bba5 --- tensorflow/tools/pip_package/setup.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 4b8289a6202..8a5450d78b6 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -43,8 +43,6 @@ from setuptools import setup from setuptools.command.install import install as InstallCommandBase from setuptools.dist import Distribution -DOCLINES = __doc__.split('\n') - # This version string is semver compatible, but incompatible with pip. # For pip, we will remove all '-' characters from this string, and use the # result for pip. @@ -93,6 +91,16 @@ if 'tf_nightly' in project_name: elif 'tensorflow_estimator' in pkg: REQUIRED_PACKAGES[i] = 'tf-estimator-nightly' +DOCLINES = __doc__.split('\n') +if project_name.endswith('-gpu'): + project_name_no_gpu = project_name[:-len('-gpu')] + _GPU_PACKAGE_NOTE = 'Note that %s package by default supports both CPU and '\ + 'GPU. %s has the same content and exists solely for backward '\ + 'compatiblity. 
Please migrate to %s for GPU support.'\ + % (project_name_no_gpu, project_name, project_name_no_gpu) + DOCLINES.append(_GPU_PACKAGE_NOTE) + + # pylint: disable=line-too-long CONSOLE_SCRIPTS = [ 'toco_from_protos = tensorflow.lite.toco.python.toco_from_protos:main', From 8dd28457699100145cad17aa4d44da81fddefda9 Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Fri, 15 May 2020 19:34:30 +0000 Subject: [PATCH 289/412] Reviewer requests --- tensorflow/stream_executor/rocm/rocm_gpu_executor.cc | 3 ++- third_party/gpus/cuda_configure.bzl | 10 +++++++--- third_party/gpus/rocm_configure.bzl | 3 +-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc index 216602a7597..fd3b5f19913 100644 --- a/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc +++ b/tensorflow/stream_executor/rocm/rocm_gpu_executor.cc @@ -133,8 +133,9 @@ bool GpuExecutor::UnloadGpuBinary(const void* gpu_binary) { GpuDriver::UnloadModule(context_, module); gpu_binary_to_module_.erase(module_it); const char* mem_it = nullptr; - for (auto x : in_memory_modules_) + for (auto x : in_memory_modules_) { if (x.second == module) mem_it = x.first; + } if (mem_it != nullptr) in_memory_modules_.erase(mem_it); } return true; diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index ce924fe4cd2..7e779a993e2 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -809,20 +809,24 @@ def make_copy_files_rule(repository_ctx, name, srcs, outs): )""" % (name, "\n".join(outs), " && \\\n".join(cmds)) def make_copy_dir_rule(repository_ctx, name, src_dir, out_dir, exceptions=None): - """Returns a rule to recursively copy a directory.""" + """Returns a rule to recursively copy a directory. + If exceptions is not None, it must be a list of files or directories in + 'src_dir'; these will be excluded from copying. + """ src_dir = _norm_path(src_dir) out_dir = _norm_path(out_dir) outs = read_dir(repository_ctx, src_dir) post_cmd='' if exceptions!=None: - outs = [x for x in outs if not any([x.startswith(y) for y in exceptions])] + outs = [x for x in outs if not any([x.startswith(src_dir+"/"+y) + for y in exceptions])] outs = [(' "%s",' % out.replace(src_dir, out_dir)) for out in outs] # '@D' already contains the relative path for a single file, see # http://docs.bazel.build/versions/master/be/make-variables.html#predefined_genrule_variables out_dir = "$(@D)/%s" % out_dir if len(outs) > 1 else "$(@D)" if exceptions!=None: for x in exceptions: - post_cmd+=" ; rm -fR " + x.replace(src_dir, out_dir) + post_cmd+=" ; rm -fR " + out_dir + "/" + x return """genrule( name = "%s", outs = [ diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl index 3f518fb05f1..4cfec2459e4 100644 --- a/third_party/gpus/rocm_configure.bzl +++ b/third_party/gpus/rocm_configure.bzl @@ -615,8 +615,7 @@ def _create_local_rocm_repository(repository_ctx): name = "rocm-include", src_dir = rocm_toolkit_path + "/include", out_dir = "rocm/include", - exceptions = [rocm_toolkit_path + "/include/gtest", - rocm_toolkit_path + "/include/gmock"], + exceptions = ["gtest", "gmock"], ), make_copy_dir_rule( repository_ctx, From 2bbf57217f277f20be3d4eabc0fb839011251ab5 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Fri, 15 May 2020 13:08:27 -0700 Subject: [PATCH 290/412] Change XLA's default to disable cpu_fast_math options with the exception of min_max behavior. 
This is due to issues around inf/nan behavior on the cpu. tf_library still enables all fast math options though with the observation that currently most users of this are desiring performance and have tested their code already. PiperOrigin-RevId: 311787817 Change-Id: Iab012d49435845dc5b7a5fcedca89bf159ec65a3 --- tensorflow/compiler/aot/tfcompile.bzl | 14 +++++++++++++- tensorflow/compiler/aot/tfcompile_main.cc | 2 ++ tensorflow/compiler/xla/debug_options_flags.cc | 17 +++++++++++++++-- tensorflow/compiler/xla/python/xla.cc | 6 +----- .../xla/service/cpu/tests/cpu_intrinsic_test.cc | 7 +++++++ .../service/cpu/tests/cpu_vectorization_test.cc | 7 +++++++ tensorflow/compiler/xla/service/llvm_ir/BUILD | 1 + .../xla/service/llvm_ir/alias_analysis_test.cc | 2 +- .../compiler/xla/service/llvm_ir/llvm_util.cc | 9 +++++++-- .../xla/tests/client_library_test_base.h | 1 + tensorflow/compiler/xla/tests/hlo_test_base.cc | 10 ++++++++++ tensorflow/compiler/xla/tests/hlo_test_base.h | 4 ++++ .../xla/tests/vector_ops_simple_test.cc | 3 +-- tensorflow/compiler/xla/xla.proto | 13 ++++++++++++- .../python/kernel_tests/betainc_op_test.py | 7 ++++--- tensorflow/python/ops/gradient_checker_test.py | 2 +- 16 files changed, 87 insertions(+), 18 deletions(-) diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index abccefbcdbb..f0c3e7da0ba 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -42,7 +42,8 @@ def tf_library( mlir_components = "None", deps = None, tags = []): - """Runs tfcompile to compile a TensorFlow graph into executable code. + """Runs tfcompile to compile a TensorFlow graph into executable code with fast + math enabled on cpu. Given an invocation of tf_library(name="foo", ...), generates the following build targets: @@ -207,6 +208,15 @@ def tf_library( srcs.append(debug_info) debug_info_flag = " --debug_info=$(location " + debug_info + ")" + default_fast_math_xla_flags = "XLA_FLAGS=\"\ + --xla_cpu_enable_fast_math=true \ + --xla_cpu_fast_math_honor_nans=false \ + --xla_cpu_fast_math_honor_infs=false \ + --xla_cpu_fast_math_honor_functions=false \ + --xla_cpu_fast_math_honor_division=false \ + --xla_cpu_enable_fast_min_max=true \ + $${XLA_FLAGS:-}\" " + native.genrule( name = ("gen_" + name), srcs = srcs, @@ -216,6 +226,7 @@ def tf_library( function_object_file, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + @@ -256,6 +267,7 @@ def tf_library( session_module_pb, ], cmd = ( + default_fast_math_xla_flags + "CUDA_VISIBLE_DEVICES='' " + "$(location " + tfcompile_tool + ")" + " --graph=$(location " + tfcompile_graph + ")" + diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index f0cf8f2ded9..846947454bb 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -67,6 +67,8 @@ int main(int argc, char** argv) { flags.entry_point = "entry"; flags.debug_info_path_begin_marker = ""; + // Note that tfcompile.bzl's tf_library macro sets fast math flags as that is + // generally the preferred case. 
std::vector flag_list; AppendMainFlags(&flag_list, &flags); xla::AppendDebugOptionsFlags(&flag_list); diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 60a563ee956..4152982bf4c 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -55,9 +55,16 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { // b/77879207. opts.set_xla_gpu_disable_multi_streaming(true); - // TODO(jlebar): Disable fastmath once doing so is not a performance - // regression. + // Disable forms of fast math that have caused users problems in the past. opts.set_xla_cpu_enable_fast_math(true); + opts.set_xla_cpu_fast_math_honor_nans(true); + opts.set_xla_cpu_fast_math_honor_infs(true); + opts.set_xla_cpu_fast_math_honor_functions(true); + opts.set_xla_cpu_fast_math_honor_division(true); + + // By default, copy TF's Eigen style min_max behavior with nans. + opts.set_xla_cpu_enable_fast_min_max(false); + opts.set_xla_gpu_enable_fast_min_max(true); opts.set_xla_allow_excess_precision(true); @@ -261,6 +268,12 @@ static void AllocateFlags() { "When xla_cpu_enable_fast_math is true then this controls whether we " "forbid to approximate calculations for functions. Ignored when " "xla_cpu_enable_fast_math is false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_cpu_enable_fast_min_max", + bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_min_max), + flag_values->xla_cpu_enable_fast_min_max(), + "Enable fast floating point min/max lowering that always propagates " + "NaNs.")); flag_objects->push_back(tensorflow::Flag( "xla_gpu_enable_fast_min_max", bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max), diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 65fb5311994..f10ec978399 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -872,11 +872,7 @@ PYBIND11_MODULE(xla_extension, m) { DebugOptions* debug_options = options.executable_build_options.mutable_debug_options(); // Sets fast-math-disabling default options expected by JAX. - // TODO(phawkins): make these XLA-wide defaults. - debug_options->set_xla_cpu_fast_math_honor_infs(true); - debug_options->set_xla_cpu_fast_math_honor_nans(true); - debug_options->set_xla_cpu_fast_math_honor_division(true); - debug_options->set_xla_cpu_fast_math_honor_functions(true); + debug_options->set_xla_cpu_enable_fast_min_max(false); debug_options->set_xla_gpu_enable_fast_min_max(false); return options; })) diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc index b6d6de28bc5..efeab3bd31a 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -70,6 +70,13 @@ class CpuUnaryIntrinsicTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? 
"" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; // Creates a module with a call to the unary op, and tests if the diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc index 8a72eb15487..757d878e224 100644 --- a/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_vectorization_test.cc @@ -69,6 +69,13 @@ class CpuVectorizationTest return absl::StrCat(opcode, "_On_", triple, (features.empty() ? "" : "_With"), features); } + + private: + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = HloTestBase::GetDebugOptionsForTest(); + HloTestBase::SetAotFastMathDebugOptions(&debug_options); + return debug_options; + } }; TEST_P(CpuVectorizationTest, DoIt) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/BUILD b/tensorflow/compiler/xla/service/llvm_ir/BUILD index 39399df7ad8..cabcc8e06ee 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/BUILD +++ b/tensorflow/compiler/xla/service/llvm_ir/BUILD @@ -64,6 +64,7 @@ cc_library( srcs = ["llvm_util.cc"], hdrs = ["llvm_util.h"], deps = [ + "//tensorflow/compiler/xla:debug_options_flags", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", diff --git a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc index 453a5cd84b2..f7808773592 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/alias_analysis_test.cc @@ -58,7 +58,7 @@ ENTRY while3 { CompileAndVerifyIr(hlo_string, R"( ; CHECK-LABEL: @body(i8* %retval -; CHECK: %[[add_result:.*]] = fadd fast float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] +; CHECK: %[[add_result:.*]] = fadd reassoc nsz contract float %[[fadd_lhs:.*]], %[[fadd_rhs:.*]] ; CHECK: store float %[[add_result]], float* %[[store_dest:.*]], align 4, !alias.scope ![[alias_scope_md_for_store:[0-9]+]] ; ; CHECK-LABEL: @condition(i8* %retval, i8* noalias %run_options, i8** noalias %params diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index 4c9a8d3e004..c2b11819448 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -30,6 +30,7 @@ limitations under the License. #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/service/cpu/cpu_options.h" @@ -90,7 +91,9 @@ llvm::CallInst* EmitCallToIntrinsic( llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. 
+ if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpUGE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { @@ -103,7 +106,9 @@ llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::Value* EmitFloatMin(llvm::Value* lhs_value, llvm::Value* rhs_value, llvm::IRBuilder<>* b) { - if (b->getFastMathFlags().noNaNs()) { + // TODO(tpopp): Pass this information down from the HLO's ModuleConfig. + if (b->getFastMathFlags().noNaNs() || + GetDebugOptionsFromFlags().xla_cpu_enable_fast_min_max()) { auto cmp = b->CreateFCmpULE(lhs_value, rhs_value); return b->CreateSelect(cmp, lhs_value, rhs_value); } else { diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 5b83186ffa4..790497f888e 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -76,6 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test { void SetFastMathDisabled(bool disabled) { auto* opts = execution_options_.mutable_debug_options(); opts->set_xla_cpu_enable_fast_math(!disabled); + opts->set_xla_cpu_enable_fast_min_max(!disabled); opts->set_xla_gpu_enable_fast_min_max(!disabled); } diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 8eed609a134..7b64be5597b 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -165,6 +165,16 @@ PrecisionConfig HloTestBase::DefaultPrecisionConfig(int operands) { return precision_config; } +void HloTestBase::SetAotFastMathDebugOptions(DebugOptions* options) { + options->set_xla_cpu_enable_fast_math(true); + options->set_xla_gpu_enable_fast_min_max(true); + options->set_xla_cpu_enable_fast_min_max(true); + options->set_xla_cpu_fast_math_honor_nans(false); + options->set_xla_cpu_fast_math_honor_infs(false); + options->set_xla_cpu_fast_math_honor_functions(false); + options->set_xla_cpu_fast_math_honor_division(false); +} + DebugOptions HloTestBase::GetDebugOptionsForTest() { auto debug_options = GetDebugOptionsFromFlags(); // TODO(b/38354253): Change tests to use Parameters instead of Constants. diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index d05776a0cb9..85b1876dd3c 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -100,6 +100,10 @@ class HloTestBase : public ::testing::Test { static PrecisionConfig DefaultPrecisionConfig(int operands); + // Sets most fath math options to be enabled to model the fast math flags + // generally used for CPU:AOT compilation. + static void SetAotFastMathDebugOptions(DebugOptions* options); + protected: // This uses the interpreter backend as the reference backend and // automatically finds another supported backend as the test backend. 
If the diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 3407a68f709..40e226f9902 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -310,8 +310,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { XLA_TEST_F(VecOpsSimpleTest, ClampFloatEdgeCases) { XlaBuilder builder(TestName()); - mutable_debug_options()->set_xla_cpu_enable_fast_math(false); - mutable_debug_options()->set_xla_gpu_enable_fast_min_max(false); + SetFastMathDisabled(true); auto low = ConstantR1(&builder, {NAN, 1, 1}); auto high = ConstantR1(&builder, {3, NAN, 3}); auto x = ConstantR1(&builder, {2, 2, NAN}); diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index f4b08f454b9..9374b1fca6a 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -148,9 +148,20 @@ message DebugOptions { // xla_cpu_enable_fast_math is false. bool xla_cpu_fast_math_honor_functions = 129; + // When false we lower the Minimum and Maximum hlos in the CPU backend such + // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NaN. In other words, if flag + // this is false we always propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the gpu flag + // below! + bool xla_cpu_enable_fast_min_max = 140; + // When true we lower the Minimum and Maximum hlos in the GPU backend such // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN. In other words, if flag // this is true we don't propagate NaNs through Min and Max. + // + // Note, this does not correspond to the exact same behavior as the cpu flag + // above! bool xla_gpu_enable_fast_min_max = 100; // Allows xla to increase the output precision of floating point operations. @@ -280,7 +291,7 @@ message DebugOptions { // memory, or have bugs. bool xla_gpu_unsafe_fallback_to_driver_on_ptxas_error = 139; - // Next id: 140 + // Next id: 141 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py index c4f70b5bc29..c564c822918 100644 --- a/tensorflow/python/kernel_tests/betainc_op_test.py +++ b/tensorflow/python/kernel_tests/betainc_op_test.py @@ -55,8 +55,8 @@ class BetaincTest(test.TestCase): # the scipy version of betainc uses a double-only implementation. 
# TODO(ebrevdo): identify reasons for (sometime) precision loss # with doubles - rtol = 1e-4 if dtype == dtypes.float32 else 5e-5 - atol = 9e-6 if dtype == dtypes.float32 else 3e-6 + rtol = 1e-4 + atol = 1e-5 self.assertAllCloseAccordingToType( scipy_out, tf_out, rtol=rtol, atol=atol) @@ -66,7 +66,8 @@ class BetaincTest(test.TestCase): with self.cached_session(): tf_comb = math_ops.betainc(a_comb, b_comb, x_comb).eval() scipy_comb = special.betainc(a_comb, b_comb, x_comb, dtype=np_dt) - self.assertAllCloseAccordingToType(scipy_comb, tf_comb) + self.assertAllCloseAccordingToType( + scipy_comb, tf_comb, rtol=rtol, atol=atol) # Test broadcasting between scalars and other shapes with self.cached_session(): diff --git a/tensorflow/python/ops/gradient_checker_test.py b/tensorflow/python/ops/gradient_checker_test.py index 92ca9c2971e..c8ebf12569a 100644 --- a/tensorflow/python/ops/gradient_checker_test.py +++ b/tensorflow/python/ops/gradient_checker_test.py @@ -149,7 +149,7 @@ class GradientCheckerTest(test.TestCase): self.assertAllEqual(correct, analytical) self.assertAllClose(correct, numerical, rtol=1e-4) self.assertLess( - gradient_checker.compute_gradient_error(x, size, y, size), 2e-4) + gradient_checker.compute_gradient_error(x, size, y, size), 3e-4) @test_util.run_deprecated_v1 def testComplexConj(self): From b3bf8bd856b7698bb84cdae07570cf0494ac9374 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 13:15:33 -0700 Subject: [PATCH 291/412] Extend Keras Lambda layers to work with functions of any signature rather than only functions that take one argument. Any *args and **kwargs passed when calling the lambda layer will be forwarded directly to the underlying lambda. PiperOrigin-RevId: 311789009 Change-Id: Ic072d2252038330cc944d7f565f14806753d7436 --- tensorflow/python/keras/layers/core.py | 43 +++++++++---------- tensorflow/python/keras/layers/core_test.py | 20 --------- .../v1/tensorflow.keras.layers.-lambda.pbtxt | 2 +- .../v2/tensorflow.keras.layers.-lambda.pbtxt | 2 +- 4 files changed, 22 insertions(+), 45 deletions(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index d1528c7ba59..db9c47eca17 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -53,7 +53,7 @@ from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging from tensorflow.python.training.tracking import base as trackable from tensorflow.python.util import nest -from tensorflow.python.util import tf_decorator +from tensorflow.python.util import tf_inspect from tensorflow.python.util.tf_export import keras_export @@ -738,8 +738,7 @@ class Lambda(Layer): models. `Lambda` layers are best suited for simple operations or quick experimentation. For more advanced use cases, follow [this guide](https://www.tensorflow.org/guide/keras/custom_layers_and_models) - for subclassing `tf.keras.layers.Layer`. (Do not subclass - `tf.keras.layers.Lamba`.) + for subclassing `tf.keras.layers.Layer`. The main reason to subclass `tf.keras.layers.Layer` instead of using a `Lambda` layer is saving and inspecting a Model. `Lambda` layers @@ -799,7 +798,8 @@ class Lambda(Layer): computation, but anything more complex should use a subclass Layer instead. Arguments: - function: The function to evaluate when the layer is called. + function: The function to be evaluated. Takes input tensor as first + argument. output_shape: Expected output shape from function. 
This argument can be inferred if not explicitly provided. Can be a tuple or function. If a tuple, it only specifies the first dimension onward; @@ -812,8 +812,8 @@ class Lambda(Layer): mask: Either None (indicating no masking) or a callable with the same signature as the `compute_mask` layer method, or a tensor that will be returned as output mask regardless of what the input is. - arguments: Optional dictionary of keyword arguments to pass by default to - the function when those arguments are not passed to the layer call. + arguments: Optional dictionary of keyword arguments to be passed to the + function. Input shape: Arbitrary. Use the keyword argument input_shape (tuple of integers, does not include the samples axis) when using this layer as the first layer in a model. @@ -823,16 +823,11 @@ class Lambda(Layer): @trackable.no_automatic_dependency_tracking def __init__(self, function, output_shape=None, mask=None, arguments=None, **kwargs): + super(Lambda, self).__init__(**kwargs) + self.arguments = arguments or {} self.function = function - # Decorate the function to produce this layer's call method - def _call_wrapper(*args, **kwargs): - return self._call_wrapper(*args, **kwargs) - self.call = tf_decorator.make_decorator(function, _call_wrapper) - - super(Lambda, self).__init__(**kwargs) - if mask is not None: self.supports_masking = True self.mask = mask @@ -841,8 +836,9 @@ class Lambda(Layer): # Warning on every invocation will be quite irksome in Eager mode. self._already_warned = False - self._expects_training_arg = 'training' in self._call_fn_args - self._expects_mask_arg = 'mask' in self._call_fn_args + function_args = tf_inspect.getfullargspec(function).args + self._fn_expects_training_arg = 'training' in function_args + self._fn_expects_mask_arg = 'mask' in function_args @tf_utils.shape_type_conversion def compute_output_shape(self, input_shape): @@ -873,22 +869,23 @@ class Lambda(Layer): output_shapes = tf_utils.convert_shapes(self._output_shape, to_tuples=False) return nest.map_structure(_add_batch, output_shapes) - def _call_wrapper(self, *args, **kwargs): + def call(self, inputs, mask=None, training=None): # We must copy for thread safety, but it only needs to be a shallow copy. 
- call_kwargs = {k: v for k, v in self.arguments.items()} - - # override default kwargs with the args passed to the layer call - call_kwargs.update(kwargs) + kwargs = {k: v for k, v in self.arguments.items()} + if self._fn_expects_mask_arg: + kwargs['mask'] = mask + if self._fn_expects_training_arg: + kwargs['training'] = training created_variables = [] - def _variable_creator(next_creator, **creator_kwargs): - var = next_creator(**creator_kwargs) + def _variable_creator(next_creator, **kwargs): + var = next_creator(**kwargs) created_variables.append(var) return var with backprop.GradientTape(watch_accessed_variables=True) as tape,\ variable_scope.variable_creator_scope(_variable_creator): - result = self.function(*args, **call_kwargs) + result = self.function(inputs, **kwargs) self._check_variables(created_variables, tape.watched_variables()) return result diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py index aa1192e12fc..3daa187f1ce 100644 --- a/tensorflow/python/keras/layers/core_test.py +++ b/tensorflow/python/keras/layers/core_test.py @@ -139,26 +139,6 @@ class LambdaLayerTest(keras_parameterized.TestCase): out = ld([x1, x2]) self.assertAllEqual(out.shape, [3, 2]) - def test_lambda_multiple_args(self): - ld = keras.layers.Lambda(lambda x, y: x[0] + y) - x1 = np.ones([3, 2], np.float32) - x2 = np.ones([3, 5], np.float32) - - expected_result = x1 * 2 - self.assertAllEqual(ld([x1, x2], x1), expected_result) - self.assertAllEqual(ld([x1, x2], y=x1), expected_result) - self.assertAllEqual(ld(x=[x1, x2], y=x1), expected_result) - - def test_lambda_constructor_args_and_multiple_args(self): - x1 = np.ones([3, 2], np.float32) - x2 = np.ones([3, 5], np.float32) - ld = keras.layers.Lambda(lambda x, y: x[0] + y, arguments={'y': x1*2}) - - self.assertAllEqual(ld([x1, x2]), x1 * 3) - self.assertAllEqual(ld([x1, x2], y=x1), x1 * 2) - self.assertAllEqual(ld(x=[x1, x2]), x1 * 3) - self.assertAllEqual(ld(x=[x1, x2], y=x1), x1 * 2) - def test_lambda_output_shape(self): l = keras.layers.Lambda(lambda x: x + 1, output_shape=(1, 1)) l(keras.backend.variable(np.ones((1, 1)))) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt index d4dbe96d1ba..22fa730112f 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt index d4dbe96d1ba..22fa730112f 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt @@ -145,7 +145,7 @@ tf_class { } member_method { name: "call" - argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None" + argspec: "args=[\'self\', \'inputs\', \'mask\', \'training\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } member_method { name: "compute_mask" From dc1c299833317401e14a1651ed906164464425c8 Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: 
Fri, 15 May 2020 13:32:17 -0700 Subject: [PATCH 292/412] Add Unsupported dtype in tfrt for backward compatibility. We will use this dtype to support legacy types (e.g. DT_RESOURCE, DT_VARIANT) that are not natively implemented in TFRT. PiperOrigin-RevId: 311791879 Change-Id: Ied0bfadf68f07e68fe8eb941c0d02bcb9f1a0b40 --- tensorflow/c/eager/context_interface.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/c/eager/context_interface.h b/tensorflow/c/eager/context_interface.h index 76f182f4945..2861fa43b66 100644 --- a/tensorflow/c/eager/context_interface.h +++ b/tensorflow/c/eager/context_interface.h @@ -101,7 +101,7 @@ class AbstractContextInterface { // Destroy the step resource container for a training step. virtual void EndStep() = 0; - // Block until all pending nodes are finished, + // Block until all pending nodes are finished. virtual Status AsyncWait() = 0; protected: From 31583920dcdeb54ffdb34acb1ce6b1db546ad33c Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Fri, 15 May 2020 13:34:26 -0700 Subject: [PATCH 293/412] Create _HostComputeMlir for use in TF MLIR. PiperOrigin-RevId: 311792286 Change-Id: I6ec57f9b23c17dd52e756ead4ddfad58ecdb2f76 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index aa1601c4032..82282bb925a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -10586,6 +10586,27 @@ def TF_ZerosLikeOp : TF_Op<"ZerosLike", [NoSideEffect, SameOperandsAndResultType TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF__HostComputeMlirOp : TF_Op<"_HostComputeMlir", []> { + let summary = "A host-side computation called from a TPU device."; + + let description = [{ + }]; + + let arguments = (ins + Variadic:$inputs, + + StrAttr:$key, + DefaultValuedAttr:$tpu_core + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedOperandTypeListAttr Tinputs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; +} + def TF__RecvTPUEmbeddingActivationsOp : TF_Op<"_RecvTPUEmbeddingActivations", []> { let summary = "An op that receives embeddng activations on the TPU."; From 27ac446be5b10ee68900696a2c5184fce727e86d Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Fri, 15 May 2020 13:34:58 -0700 Subject: [PATCH 294/412] Enable MLIR saved model import by default in TFLiteConverterV2's saved model API PiperOrigin-RevId: 311792366 Change-Id: I98356499c0a1eb7c740104ca4b11af5d45c4a4a1 --- tensorflow/lite/python/lite.py | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 99be58f4376..ce59c56a1d0 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -386,13 +386,8 @@ class TFLiteConverterBase(object): return True return False - def _parse_saved_model_args(self, always_enable_saved_model_import=False): - """Parses SavedModel arguments from the given Keras/RNN SavedModel. - - Args: - always_enable_saved_model_import: Bool. When the value is true, it enables - MLIR saved model import path regardless of checking the conditions. 
- """ + def _parse_saved_model_args(self): + """Parses SavedModel arguments from the given Keras/RNN SavedModel.""" if not self.experimental_new_converter: self.saved_model_dir = None return @@ -405,17 +400,16 @@ class TFLiteConverterBase(object): # frozen graph def path. self.saved_model_dir = None return - if (not always_enable_saved_model_import and - not self._contains_function_with_implements_attr(saved_model_proto)): + if not self._contains_function_with_implements_attr(saved_model_proto): self.saved_model_dir = None - return - - if not self._saved_model_exported_names: - self._saved_model_exported_names = [] - self._saved_model_version = saved_model_proto.saved_model_schema_version - if self._saved_model_version not in [1, 2]: - raise ValueError("SavedModel file format({0}) is not supported".format( - self._saved_model_version)) + else: + if not self._saved_model_exported_names: + self._saved_model_exported_names = [] + self._saved_model_version = saved_model_proto.saved_model_schema_version + if self._saved_model_version not in [1, 2]: + raise ValueError( + "SavedModel file format({0}) is not supported".format( + self._saved_model_version)) class TFLiteConverterBaseV2(TFLiteConverterBase): @@ -548,7 +542,7 @@ class TFLiteSavedModelConverterV2(TFLiteConverterBaseV2): self._saved_model_tags = saved_model_tags self._saved_model_exported_names = saved_model_exported_names self._trackable_obj = trackable_obj - self._parse_saved_model_args(always_enable_saved_model_import=True) + self._parse_saved_model_args() def convert(self): """Converts a TensorFlow GraphDef based on instance variables. From 340ac1aedb082dbf3092608354c8f5a1d2d276d9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 13:49:39 -0700 Subject: [PATCH 295/412] Move GetDeviceCoordinates() function and related constants in tpu_rewrite pass to common utility file. 
PiperOrigin-RevId: 311795001 Change-Id: If86babf6656da132fb58b1a2266034f3b341e06d --- tensorflow/compiler/mlir/tensorflow/BUILD | 2 + .../tensorflow/transforms/tpu_rewrite_pass.cc | 63 +++++++------------ .../utils/tpu_rewrite_device_util.cc | 33 +++++++++- .../utils/tpu_rewrite_device_util.h | 10 +++ .../utils/tpu_rewrite_device_util_test.cc | 26 ++++++++ 5 files changed, 91 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index eb220a31f80..2bbdbb383a1 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -1279,6 +1279,7 @@ cc_library( "//tensorflow/stream_executor/lib", "@com_google_absl//absl/strings", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) @@ -1293,6 +1294,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/protobuf/tpu:topology_proto_cc", "@llvm-project//llvm:support", + "@llvm-project//mlir:IR", ], ) diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index f5e9da915c8..986736a9502 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -64,19 +64,14 @@ static llvm::cl::opt tpu_compile_metadata_debug( "'tf._TPUCompileMlir' op as a proto debug string")); constexpr char kNumReplicasAttr[] = "num_replicas"; -constexpr char kNumCoresPerReplicaAttr[] = "num_cores_per_replica"; constexpr char kStepMarkerLocationAttr[] = "step_marker_location"; constexpr char kPaddingMapAttr[] = "padding_map"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; constexpr char kDeviceAttr[] = "device"; constexpr char kDevicesAttr[] = "devices"; constexpr char kVersionsAttr[] = "tf.versions"; constexpr char kBadStringArrayElementMsg[] = "bad '{0}' attribute at index {1}, not a string"; -constexpr char kBadIntArrayElementMsg[] = - "bad '{0}' attribute at index {1}, not an int"; constexpr char kBadArrayElementMsg[] = "bad '{0}' attribute at index {1} with value '{2}': failed to parse to {3}"; constexpr char kBadArrayAttrLengthMsg[] = @@ -163,32 +158,6 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, return success(); } -// Extracts device coordinates from a device assignment attribute on an op. -LogicalResult GetDeviceCoordinates( - tf_device::ClusterFuncOp op, - llvm::SmallVectorImpl* device_assignment) { - auto device_assignment_attr = - op.getAttrOfType(kDeviceAssignmentAttr); - if (!device_assignment_attr) - return op.emitOpError(CreateMissingAttributeMsg(kDeviceAssignmentAttr)); - - device_assignment->reserve(device_assignment_attr.size()); - - for (auto device_coordinate_and_idx : - llvm::enumerate(device_assignment_attr)) { - auto device_coordinate = - device_coordinate_and_idx.value().dyn_cast(); - if (!device_coordinate) - return op.emitOpError(llvm::formatv(kBadIntArrayElementMsg, - kDeviceAssignmentAttr, - device_coordinate_and_idx.index())); - - device_assignment->push_back(device_coordinate.getInt()); - } - - return success(); -} - // Populates a TPUCompileMetadataProto with StepMarkerLocation from a // `tf_device::ClusterFuncOp`. 
LogicalResult SetMetadataProtoStepMarkerLocation( @@ -661,27 +630,41 @@ LogicalResult Rewrite( : nullptr; if (replicate) num_replicas = replicate.n().getLimitedValue(); - auto num_cores_per_replica_attr = - cluster_func.getAttrOfType(kNumCoresPerReplicaAttr); + auto num_cores_per_replica_attr = cluster_func.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); if (!num_cores_per_replica_attr) return cluster_func.emitOpError( - CreateMissingAttributeMsg(kNumCoresPerReplicaAttr)); + CreateMissingAttributeMsg(tensorflow::kNumCoresPerReplicaAttr)); int num_cores_per_replica = num_cores_per_replica_attr.getInt(); - auto topology_attr = cluster_func.getAttrOfType(kTopologyAttr); + auto topology_attr = + cluster_func.getAttrOfType(tensorflow::kTopologyAttr); if (!topology_attr) - return cluster_func.emitOpError(CreateMissingAttributeMsg(kTopologyAttr)); + return cluster_func.emitOpError( + CreateMissingAttributeMsg(tensorflow::kTopologyAttr)); - llvm::SmallVector device_assignment; - if (failed(GetDeviceCoordinates(cluster_func, &device_assignment))) - return failure(); + auto device_assignment_attr = cluster_func.getAttrOfType( + tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return cluster_func.emitOpError( + llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + if (!status_or_device_coodinates.ok()) + return cluster_func.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); // Determine compilation and execution devices. auto status_or_tpu_device_assignment = tensorflow::GetTPUCompilationAndExecutionDevices( devices, num_replicas, num_cores_per_replica, - topology_attr.getValue(), device_assignment); + topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); if (!status_or_tpu_device_assignment.ok()) return cluster_func.emitError() << "error in fetching TPU compilation/execution devices: " diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 06c10c26835..282b7ad3139 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -26,9 +26,9 @@ limitations under the License. #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/service/computation_placer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -39,6 +39,12 @@ limitations under the License. #include "tensorflow/stream_executor/lib/statusor.h" namespace tensorflow { + +const char* const kTPUReplicatedHost = "TPU_REPLICATED_HOST"; +const char* const kNumCoresPerReplicaAttr = "num_cores_per_replica"; +const char* const kTopologyAttr = "topology"; +const char* const kDeviceAssignmentAttr = "device_assignment"; + // Device coordinates are defined as (x, y, z, core), thus resulting in a rank 4 // topology. 
constexpr int kTPUTopologyRank = 4; @@ -46,8 +52,8 @@ constexpr int kTPUTopologyRank = 4; constexpr char kDeviceTPUSystem[] = "TPU_SYSTEM"; constexpr char kDeviceTPU[] = "TPU"; constexpr char kTPUReplicatedCore[] = "TPU_REPLICATED_CORE"; -constexpr char kTopologyAttr[] = "topology"; -constexpr char kDeviceAssignmentAttr[] = "device_assignment"; +constexpr char kBadIntArrayElementMsg[] = + "bad '{0}' attribute at index {1}, not an int"; using Device = DeviceNameUtils::ParsedName; using Devices = llvm::ArrayRef; @@ -417,6 +423,27 @@ GetGeneralTPUExecutionDeviceAssignment( } // anonymous namespace +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr) { + llvm::SmallVector device_coordinates; + device_coordinates.reserve(device_assignment_attr.size()); + + for (auto device_coordinate_and_idx : + llvm::enumerate(device_assignment_attr)) { + auto device_coordinate = + device_coordinate_and_idx.value().dyn_cast(); + if (!device_coordinate) + return errors::InvalidArgument( + llvm::formatv(kBadIntArrayElementMsg, kDeviceAssignmentAttr, + device_coordinate_and_idx.index()) + .str()); + + device_coordinates.push_back(device_coordinate.getInt()); + } + + return device_coordinates; +} + StatusOr GetTPUCompilationAndExecutionDevices( Devices devices, int num_replicas, int num_cores_per_replica, llvm::StringRef topology_attr, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index 5fdb6b8768b..6bb541ab683 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -22,6 +22,7 @@ limitations under the License. #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/util/device_name_utils.h" @@ -30,6 +31,11 @@ limitations under the License. namespace tensorflow { using stream_executor::port::StatusOr; +extern const char* const kTPUReplicatedHost; +extern const char* const kNumCoresPerReplicaAttr; +extern const char* const kTopologyAttr; +extern const char* const kDeviceAssignmentAttr; + // A TPU device for execution alongside its associated host CPU device. struct TPUDeviceAndHost { TPUDeviceAndHost() {} @@ -67,6 +73,10 @@ struct TPUDeviceAssignment { llvm::Optional xla_device_assignment; }; +// Extracts device coordinates from a device assignment attribute on an op. +StatusOr> GetDeviceCoordinates( + mlir::ArrayAttr device_assignment_attr); + // Finds the TPU compilation device and execution devices from `devices` for a // TPU computation subgraph. Compilation device is determined from looking up // all TPU_SYSTEM:0 devices and choosing the CPU device associated to the first diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 7ac5635a6e4..a70e93a0195 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -19,6 +19,8 @@ limitations under the License. 
#include #include "llvm/Support/FormatVariadic.h" +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/tpu/topology.pb.h" @@ -596,5 +598,29 @@ TEST(TPURewriteDeviceUtilTest, ValidGeneralDeviceAssignmentMesh1x2x1x3) { EXPECT_EQ(computation_device_2.replica_device_ids(1), 3); } +TEST(TPURewriteDeviceUtilTest, TestGetDeviceCoordinates) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getI64ArrayAttr({1, 2, 3}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(status_or_device_coodinates.ok()); + auto device_coordinates = status_or_device_coodinates.ConsumeValueOrDie(); + EXPECT_EQ(device_coordinates[0], 1); + EXPECT_EQ(device_coordinates[1], 2); + EXPECT_EQ(device_coordinates[2], 3); +} + +TEST(TPURewriteDeviceUtilTest, TestInvalidAttrForDeviceAssignmentDisallowed) { + mlir::MLIRContext context; + mlir::Builder builder(&context); + auto device_assignment_attr = builder.getF32ArrayAttr({1.0, 2.0, 3.0}); + auto status_or_device_coodinates = + GetDeviceCoordinates(device_assignment_attr); + ASSERT_TRUE(!status_or_device_coodinates.ok()); + EXPECT_EQ(status_or_device_coodinates.status().error_message(), + "bad 'device_assignment' attribute at index 0, not an int"); +} + } // anonymous namespace } // namespace tensorflow From 20f064ffa6a0b40e76f8e7b37a0a647febe5a840 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 15 May 2020 14:30:51 -0700 Subject: [PATCH 296/412] optimize for int8 add. PiperOrigin-RevId: 311802413 Change-Id: I14cd70984ae7a8cad89b9c4a1a5216fcb7609c0e --- .../internal/optimized/integer_ops/add.h | 141 +++++++----------- 1 file changed, 50 insertions(+), 91 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index 8937fe2b26e..a9dae4feac5 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -35,99 +35,58 @@ inline void AddElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); - #ifdef USE_NEON - const int8x16_t output_activation_min_vector = - vdupq_n_s8(params.quantized_activation_min); - const int8x16_t output_activation_max_vector = - vdupq_n_s8(params.quantized_activation_max); - - const int input1_left_shift = params.left_shift + params.input1_shift; - const int input2_left_shift = params.left_shift + params.input2_shift; - const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); - const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); - - for (; i <= size - 16; i += 16) { - const int8x16_t input1_val_original = vld1q_s8(input1_data + i); - const int8x16_t input2_val_original = vld1q_s8(input2_data + i); - - const int16x8_t input1_val_s16_high = - vmovl_s8(vget_high_s8(input1_val_original)); - const int16x8_t input1_val_s16_low = - vmovl_s8(vget_low_s8(input1_val_original)); - - const int16x8_t input2_val_s16_high = - vmovl_s8(vget_high_s8(input2_val_original)); - const int16x8_t input2_val_s16_low = - vmovl_s8(vget_low_s8(input2_val_original)); - const int16x8_t input1_val_high = - vaddq_s16(input1_val_s16_high, 
vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val_high = - vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); - const int16x8_t input1_val_low = - vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val_low = - vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); - const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); - const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); - const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); - const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); - const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); - const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); - const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); - const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); - int32x4_t x111 = vmovl_s16(input1_val_low_low); - int32x4_t x112 = vmovl_s16(input1_val_low_high); - int32x4_t x121 = vmovl_s16(input1_val_high_low); - int32x4_t x122 = vmovl_s16(input1_val_high_high); - int32x4_t x211 = vmovl_s16(input2_val_low_low); - int32x4_t x212 = vmovl_s16(input2_val_low_high); - int32x4_t x221 = vmovl_s16(input2_val_high_low); - int32x4_t x222 = vmovl_s16(input2_val_high_high); - - x111 = vshlq_s32(x111, input1_left_dup); - x112 = vshlq_s32(x112, input1_left_dup); - x121 = vshlq_s32(x121, input1_left_dup); - x122 = vshlq_s32(x122, input1_left_dup); - x211 = vshlq_s32(x211, input2_left_dup); - x212 = vshlq_s32(x212, input2_left_dup); - x221 = vshlq_s32(x221, input2_left_dup); - x222 = vshlq_s32(x222, input2_left_dup); - x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); - x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); - x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier); - x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); - x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); - x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); - x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); - x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); - int32x4_t s11 = vaddq_s32(x111, x211); - int32x4_t s12 = vaddq_s32(x112, x212); - int32x4_t s21 = vaddq_s32(x121, x221); - int32x4_t s22 = vaddq_s32(x122, x222); - s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); - s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); - s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); - s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); + const int8x8_t output_activation_min_vector = + vdup_n_s8(params.quantized_activation_min); + const int8x8_t output_activation_max_vector = + vdup_n_s8(params.quantized_activation_max); + for (; i <= size - 8; i += 8) { + const int8x8_t input1_val_original = vld1_s8(input1_data + i); + const int8x8_t input2_val_original = vld1_s8(input2_data + i); + const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); + const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); + const int16x8_t input1_val = + vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val = + vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); + const int16x4_t input1_val_high = vget_high_s16(input1_val); + const int16x4_t input1_val_low = vget_low_s16(input1_val); + const int16x4_t input2_val_high = vget_high_s16(input2_val); + const int16x4_t input2_val_low = vget_low_s16(input2_val); + int32x4_t x11 = vmovl_s16(input1_val_low); + int32x4_t x12 = vmovl_s16(input1_val_high); + int32x4_t x21 = 
vmovl_s16(input2_val_low); + int32x4_t x22 = vmovl_s16(input2_val_high); + const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); + x11 = vshlq_s32(x11, left_shift_dup); + x12 = vshlq_s32(x12, left_shift_dup); + x21 = vshlq_s32(x21, left_shift_dup); + x22 = vshlq_s32(x22, left_shift_dup); + x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); + x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier); + x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); + x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); + const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); + const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); + x11 = vshlq_s32(x11, input1_shift_dup); + x12 = vshlq_s32(x12, input1_shift_dup); + x21 = vshlq_s32(x21, input2_shift_dup); + x22 = vshlq_s32(x22, input2_shift_dup); + int32x4_t s1 = vaddq_s32(x11, x21); + int32x4_t s2 = vaddq_s32(x12, x22); + s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); + s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - s11 = RoundingDivideByPOT(s11, -params.output_shift); - s12 = RoundingDivideByPOT(s12, -params.output_shift); - s21 = RoundingDivideByPOT(s21, -params.output_shift); - s22 = RoundingDivideByPOT(s22, -params.output_shift); - const int16x4_t s11_narrowed = vmovn_s32(s11); - const int16x4_t s12_narrowed = vmovn_s32(s12); - const int16x4_t s21_narrowed = vmovn_s32(s21); - const int16x4_t s22_narrowed = vmovn_s32(s22); - const int16x8_t s1 = vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), - vdupq_n_s16(params.output_offset)); - const int16x8_t s2 = vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), - vdupq_n_s16(params.output_offset)); - const int16x8_t s = vcombine_s16(vqmovn_s16(s1), vqmovn_s16(s2)); - - const int8x16_t clamped = - vmaxq_s8(output_activation_min_vector, - vminq_s8(output_activation_max_vector, s)); - vst1q_s8(output_data + i, clamped); + s1 = RoundingDivideByPOT(s1, -params.output_shift); + s2 = RoundingDivideByPOT(s2, -params.output_shift); + const int16x4_t s1_narrowed = vmovn_s32(s1); + const int16x4_t s2_narrowed = vmovn_s32(s2); + const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), + vdupq_n_s16(params.output_offset)); + const int8x8_t clamped = + vmax_s8(output_activation_min_vector, + vmin_s8(output_activation_max_vector, vqmovn_s16(s))); + vst1_s8(output_data + i, clamped); } #endif // NEON From c77c31d45d849ebdf6ab53f9238137ffebe07829 Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Fri, 15 May 2020 14:36:46 -0700 Subject: [PATCH 297/412] Enable TextVectorization to be called on lists of strings (or lists of list-wrapped strings). Using NumPy arrays of characters is generally a bad practice because of their extreme memory usage. 
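An illustrative sketch (not part of this patch) of the usage this change enables, assuming the tf.keras experimental preprocessing API path of this era; the token ids shown are examples only:

    # Adapt and call TextVectorization directly on Python lists of strings,
    # with no need to wrap the texts in a NumPy array first.
    import tensorflow as tf

    texts = ["two two two", "two three three", "three four four five"]
    layer = tf.keras.layers.experimental.preprocessing.TextVectorization()
    layer.adapt(texts)                        # a plain list is now accepted
    print(layer(["two three", "four five"]))  # int token ids, e.g. [[2, 3], [4, 5]]
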
PiperOrigin-RevId: 311803496 Change-Id: I179eee4a8a879f8871ef6cc1253c34c42da06983 --- .../keras/engine/base_preprocessing_layer.py | 14 +++-- .../python/keras/layers/preprocessing/BUILD | 1 + .../preprocessing/text_vectorization.py | 29 ++++++++--- .../preprocessing/text_vectorization_test.py | 52 +++++++++++++++++++ 4 files changed, 85 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/keras/engine/base_preprocessing_layer.py b/tensorflow/python/keras/engine/base_preprocessing_layer.py index 84138dd0a00..efd8a0e621f 100644 --- a/tensorflow/python/keras/engine/base_preprocessing_layer.py +++ b/tensorflow/python/keras/engine/base_preprocessing_layer.py @@ -143,9 +143,12 @@ class CombinerPreprocessingLayer(PreprocessingLayer): accumulator = self._combiner.restore(self._restore_updates()) if not isinstance(data, - (dataset_ops.DatasetV2, np.ndarray, ops.EagerTensor)): + (dataset_ops.DatasetV2, + np.ndarray, + ops.Tensor, + ragged_tensor.RaggedTensor)): raise ValueError( - '`adapt()` requires a batched Dataset, an EagerTensor, ' + '`adapt()` requires a batched Dataset, a Tensor, ' 'or a Numpy array as input, ' 'got {}'.format(type(data))) @@ -158,9 +161,14 @@ class CombinerPreprocessingLayer(PreprocessingLayer): 'elements. Please use `dataset.take(...)` to make the number ' 'of elements finite.') next_data = self._get_dataset_iterator(data) + # TODO(fchollet): consider checking if the dataset is already batched + # and otherwise batching it. + elif isinstance(data, (ops.Tensor, ragged_tensor.RaggedTensor)): + next_data = self._get_dataset_iterator( + dataset_ops.Dataset.from_tensor_slices(data).batch(512)) else: generator, _ = training_generator.convert_to_generator_like( - data, batch_size=len(data)) + data, batch_size=512) # If the data is not a dataset, we can iterate over it using next(foo); # here, we wrap that into a callable. next_data = lambda: next(generator) diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index c1e1d5573e5..052a57b52f3 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -521,6 +521,7 @@ tf_py_test( size = "medium", srcs = ["text_vectorization_test.py"], python_version = "PY3", + shard_count = 4, deps = [ ":preprocessing_test_utils", ":text_vectorization", diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 4156ba50c02..b1eff6e0bf3 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -346,11 +346,16 @@ class TextVectorization(CombinerPreprocessingLayer): return tensor_shape.TensorShape([input_shape[0], self._max_tokens]) if self._output_mode == INT and self._split is None: - return input_shape + if len(input_shape) == 1: + input_shape = tuple(input_shape) + (1,) + return tensor_shape.TensorShape(input_shape) if self._output_mode == INT and self._split is not None: input_shape = list(input_shape) - input_shape[1] = self._output_sequence_length + if len(input_shape) == 1: + input_shape = input_shape + [self._output_sequence_length] + else: + input_shape[1] = self._output_sequence_length return tensor_shape.TensorShape(input_shape) def compute_output_signature(self, input_spec): @@ -366,7 +371,7 @@ class TextVectorization(CombinerPreprocessingLayer): Arguments: data: The data to train on. 
It can be passed either as a tf.data Dataset, - or as a numpy array. + as a NumPy array, a string tensor, or as a list of texts. reset_state: Optional argument specifying whether to clear the state of the layer at the start of the call to `adapt`. This must be True for this layer, which does not support repeated calls to `adapt`. @@ -377,24 +382,30 @@ class TextVectorization(CombinerPreprocessingLayer): # Build the layer explicitly with the original data shape instead of relying # on an implicit call to `build` in the base layer's `adapt`, since # preprocessing changes the input shape. - if isinstance(data, np.ndarray): - if data.ndim == 1: - data = np.expand_dims(data, axis=-1) + if isinstance(data, (list, tuple, np.ndarray)): + data = ops.convert_to_tensor(data) + + if isinstance(data, ops.Tensor): + if data.shape.rank == 1: + data = array_ops.expand_dims(data, axis=-1) self.build(data.shape) - preprocessed_inputs = self._to_numpy(self._preprocess(data)) + preprocessed_inputs = self._preprocess(data) elif isinstance(data, dataset_ops.DatasetV2): # TODO(momernick): Replace this with a more V2-friendly API. shape = dataset_ops.get_legacy_output_shapes(data) if not isinstance(shape, tensor_shape.TensorShape): raise ValueError("The dataset passed to 'adapt' must contain a single " "tensor value.") + if shape.rank == 0: + data = data.map(lambda tensor: array_ops.expand_dims(tensor, 0)) + shape = dataset_ops.get_legacy_output_shapes(data) if shape.rank == 1: data = data.map(lambda tensor: array_ops.expand_dims(tensor, -1)) self.build(dataset_ops.get_legacy_output_shapes(data)) preprocessed_inputs = data.map(self._preprocess) else: raise ValueError( - "adapt() requires a Dataset or a Numpy array as input, got {}".format( + "adapt() requires a Dataset or an array as input, got {}".format( type(data))) super(TextVectorization, self).adapt(preprocessed_inputs, reset_state) @@ -561,6 +572,8 @@ class TextVectorization(CombinerPreprocessingLayer): return inputs def call(self, inputs): + if isinstance(inputs, (list, tuple, np.ndarray)): + inputs = ops.convert_to_tensor(inputs) if inputs.shape.rank == 1: inputs = array_ops.expand_dims(inputs, axis=-1) diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index f8a1f5b9434..5a9762719d5 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -29,6 +29,7 @@ from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import one_device_strategy from tensorflow.python.eager import context +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.keras import backend from tensorflow.python.keras import keras_parameterized @@ -286,6 +287,57 @@ class TextVectorizationLayerTest(keras_parameterized.TestCase, adapt_data=vocab_data) self.assertAllClose(expected_output, output_data) + def test_list_inputs_1d(self): + vocab_data = ["two two two", "two three three", "three four four five"] + input_data = ["two three", "four five"] + layer = get_layer_class()() + layer.adapt(vocab_data) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + layer.set_vocabulary(["two", "three", "four", "five"]) + out = layer(input_data) + if context.executing_eagerly(): + 
self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + + def test_tensor_inputs(self): + vocab_data = constant_op.constant( + ["two two two", "two three three", "three four four five"]) + input_data = constant_op.constant(["two three", "four five"]) + layer = get_layer_class()() + layer.adapt(vocab_data) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + layer.set_vocabulary(["two", "three", "four", "five"]) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + + def test_list_inputs_2d(self): + vocab_data = [ + ["two two two"], ["two three three"], ["three four four five"]] + input_data = [["two three"], ["four five"]] + layer = get_layer_class()() + layer.adapt(vocab_data) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + layer.set_vocabulary(["two", "three", "four", "five"]) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + + def test_dataset_of_single_strings(self): + vocab_data = ["two two two", "two three three", "three four four five"] + input_data = ["two three", "four five"] + vocab_ds = dataset_ops.Dataset.from_tensor_slices(vocab_data) # unbatched + layer = get_layer_class()() + layer.adapt(vocab_ds) + out = layer(input_data) + if context.executing_eagerly(): + self.assertAllClose(out.numpy(), [[2, 3], [4, 5]]) + @keras_parameterized.run_all_keras_modes class TextVectorizationPreprocessingTest( From cfb6d217c9963de69a31d543a373b9a39854108c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 14:41:08 -0700 Subject: [PATCH 298/412] Implement NNAPI QoS APIs in NNAPI delegate. PiperOrigin-RevId: 311804298 Change-Id: Ia018050ca90fbc2cc12f363b5bc52727734e4abf --- .../lite/delegates/nnapi/nnapi_delegate.cc | 37 +++++ .../lite/delegates/nnapi/nnapi_delegate.h | 36 +++++ .../delegates/nnapi/nnapi_delegate_test.cc | 17 ++ tensorflow/lite/nnapi/NeuralNetworksTypes.h | 24 +++ tensorflow/lite/nnapi/nnapi_implementation.cc | 11 ++ tensorflow/lite/nnapi/nnapi_implementation.h | 148 ++++++++++++++++++ 6 files changed, 273 insertions(+) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index e6faea62bf6..39ab19aed2d 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -3256,6 +3256,22 @@ TfLiteStatus NNAPIDelegateKernel::Prepare(TfLiteContext* context, RETURN_TFLITE_ERROR_IF_NN_ERROR(context, set_caching_result, "configuring NNAPI caching", nnapi_errno); } + // Set compilation timeout if applicable. 
+ if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI13) { + if (delegate_options.max_compilation_timeout_duration_ns > 0) { + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksCompilation_setTimeout( + compilation, + delegate_options.max_compilation_timeout_duration_ns), + "setting compilation timeout", nnapi_errno); + } + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksCompilation_setPriority( + compilation, delegate_options.execution_priority), + "setting compilation priority", nnapi_errno); + } const int finish_result = nnapi_->ANeuralNetworksCompilation_finish(compilation); if (finish_result != ANEURALNETWORKS_NO_ERROR) { @@ -3322,6 +3338,27 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, std::unique_ptr execution_unique_ptr(execution, NNFreeExecution(nnapi_)); + // Set compilation timeout if applicable. + const auto delegate_options = + StatefulNnApiDelegate::GetOptions(node->delegate); + if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI13) { + if (delegate_options.max_execution_timeout_duration_ns > 0) { + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksExecution_setTimeout( + execution, delegate_options.max_execution_timeout_duration_ns), + "setting execution timeout", nnapi_errno); + } + if (delegate_options.max_execution_loop_timeout_duration_ns > 0) { + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, + nnapi_->ANeuralNetworksExecution_setLoopTimeout( + execution, + delegate_options.max_execution_loop_timeout_duration_ns), + "setting execution loop timeout", nnapi_errno); + } + } + // Set the input tensor buffers. Note: we access tflite tensors using // absolute indices but NN api indices inputs by relative indices. int relative_input_index = 0; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index b94c6d66978..68c55e1aef4 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -22,6 +22,7 @@ limitations under the License. #include "absl/types/optional.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/nnapi/NeuralNetworksTypes.h" #include "tensorflow/lite/nnapi/nnapi_implementation.h" typedef struct ANeuralNetworksMemory ANeuralNetworksMemory; @@ -92,6 +93,30 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // allow fp32 compuation to be run in fp16. bool allow_fp16 = false; + + // Specifies the relative priority for executions of the model. + // Available values are {ANEURALNETWORKS_PRIORITY_LOW, + // ANEURALNETWORKS_PRIORITY_MEDIUM, ANEURALNETWORKS_PRIORITY_HIGH, + // ANEURALNETWORKS_PRIORITY_DEFAULT}. + int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT; + + // Specifies the maximum expected duration in nanosecond for compiling the + // model. If the device is not able to complete the compilation within the + // specified duration, the compilation may be aborted. If set to 0, the + // timeout duration is considered infinite. + uint64_t max_compilation_timeout_duration_ns = 0; + + // Specifies the maximum expected duration in nanosecond for executing the + // model. If the device is not able to complete the execution within the + // specified duration, the execution may be aborted. If set to 0, the + // timeout duration is considered infinite. + uint64_t max_execution_timeout_duration_ns = 0; + + // Specifies the maximum expected duration in nanosecond for WHILE loops in + // the execution. 
If a WHILE loop condition model does not output false + // within the specified duration, the execution will be aborted. If set to + // 0, the default timeout for loops will be used. + uint64_t max_execution_loop_timeout_duration_ns = 0; }; // Uses default options. @@ -189,6 +214,17 @@ class StatefulNnApiDelegate : public TfLiteDelegate { int max_number_delegated_partitions; // allow fp32 computation to be run in fp16. bool allow_fp16; + // Specifies the relative priority for executions of the model. + int execution_priority = ANEURALNETWORKS_PRIORITY_DEFAULT; + // Specifies the maximum expected duration in nanosecond for compiling the + // model. + uint64_t max_compilation_timeout_duration_ns = 0; + // Specifies the maximum expected duration in nanosecond for executing the + // model. + uint64_t max_execution_timeout_duration_ns = 0; + // Specifies the maximum expected duration in nanosecond for WHILE loops in + // the execution + uint64_t max_execution_loop_timeout_duration_ns = 0; ~Data(); diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc index ea9111c4567..acfa0c77d30 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc @@ -304,6 +304,23 @@ TEST(NNAPIDelegate, StatefulDelegateWithCompilationCaching) { EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3})); } +// Sanity check for the state-ful NNAPI delegate with QoS hints. +TEST(NNAPIDelegate, StatefulDelegateWithQoS) { + StatefulNnApiDelegate::Options options; + options.execution_priority = ANEURALNETWORKS_PRIORITY_HIGH; + options.max_compilation_timeout_duration_ns = UINT64_MAX; + options.max_execution_timeout_duration_ns = UINT64_MAX; + options.max_execution_loop_timeout_duration_ns = UINT64_MAX; + + FloatAddOpModel m(options, {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {1, 2, 2, 1}}, + {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE); + m.PopulateTensor(m.input1(), {-2.0, 0.2, 0.7, 0.8}); + m.PopulateTensor(m.input2(), {0.1, 0.2, 0.3, 0.5}); + m.Invoke(); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3})); +} + // Sanity check for the state-ful NNAPI delegate using TfLiteBufferHandle. TEST(NNAPIDelegate, StatefulDelegateWithBufferHandles) { // Skip the test if Android specific functions could not be found. diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index a3dfd373405..6739838e4d1 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -215,6 +215,18 @@ enum { ANEURALNETWORKS_DEVICE_ACCELERATOR = 4, }; +/** + * Relative execution priority. + * + * Available since API level 30. + */ +enum { + ANEURALNETWORKS_PRIORITY_LOW = 90, + ANEURALNETWORKS_PRIORITY_MEDIUM = 100, + ANEURALNETWORKS_PRIORITY_HIGH = 110, + ANEURALNETWORKS_PRIORITY_DEFAULT = ANEURALNETWORKS_PRIORITY_MEDIUM, +}; + /** * ANeuralNetworksMemory is an opaque type that represents memory. 
* @@ -528,9 +540,21 @@ typedef int (*ANeuralNetworksCompilation_setCaching_fn)( ANeuralNetworksCompilation* compilation, const char* cacheDir, const uint8_t* token); +typedef int (*ANeuralNetworksCompilation_setTimeout_fn)( + ANeuralNetworksCompilation* compilation, uint64_t duration); + +typedef int (*ANeuralNetworksCompilation_setPriority_fn)( + ANeuralNetworksCompilation* compilation, int priority); + typedef int (*ANeuralNetworksExecution_compute_fn)( ANeuralNetworksExecution* execution); +typedef int (*ANeuralNetworksExecution_setTimeout_fn)( + ANeuralNetworksExecution* execution, uint64_t duration); + +typedef int (*ANeuralNetworksExecution_setLoopTimeout_fn)( + ANeuralNetworksExecution* execution, uint64_t duration); + typedef int (*ANeuralNetworksExecution_getOutputOperandRank_fn)( ANeuralNetworksExecution* execution, int32_t index, uint32_t* rank); diff --git a/tensorflow/lite/nnapi/nnapi_implementation.cc b/tensorflow/lite/nnapi/nnapi_implementation.cc index accdfb6c7da..ad5869fec04 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.cc +++ b/tensorflow/lite/nnapi/nnapi_implementation.cc @@ -215,6 +215,17 @@ const NnApi LoadNnApi() { ANeuralNetworksModel_getExtensionOperationType); LOAD_FUNCTION_OPTIONAL(libneuralnetworks, ANeuralNetworksModel_setOperandExtensionData); + + // API 30 (NNAPI 1.3) methods. + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksCompilation_setTimeout); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksCompilation_setPriority); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksExecution_setTimeout); + LOAD_FUNCTION_OPTIONAL(libneuralnetworks, + ANeuralNetworksExecution_setLoopTimeout); + return nnapi; } diff --git a/tensorflow/lite/nnapi/nnapi_implementation.h b/tensorflow/lite/nnapi/nnapi_implementation.h index a27f5ba661a..abee0fbdef3 100644 --- a/tensorflow/lite/nnapi/nnapi_implementation.h +++ b/tensorflow/lite/nnapi/nnapi_implementation.h @@ -789,6 +789,76 @@ struct NnApi { ANeuralNetworksCompilation* compilation, const char* cacheDir, const uint8_t* token); + /** + * Set the maximum expected duration for compiling the model. + * + * If the device is not able to complete the compilation within the specified + * duration, the compilation may be aborted. The timeout duration begins at + * the call to {@link ANeuralNetworksCompilation_finish}. + * + * This timeout duration acts as a hint to drivers, and can be used to both + * free up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long a compilation will take to abort the + * compilation before it has even started if the driver believes the + * compilation cannot be completed within the timeout duration. Similarly, it + * enables drivers to abort an ongoing compilation if it is taking too long. + * However, this call does not guarantee that the compilation will complete or + * abort within the timeout duration. + * + * By default (i.e., unless ANeuralNetworksCompilation_setTimeout is called), + * the timeout duration for compiling the model is considered infinite. + * + * The {@link ANeuralNetworksCompilation} must have been created with + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. 
If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then + * the timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded + * usage. + * + * @param compilation The compilation to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected + * to be spent finishing a compilation. If this duration is exceeded, the + * compilation may be aborted. If set to 0, the timeout duration is + * considered infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ + int (*ANeuralNetworksCompilation_setTimeout)( + ANeuralNetworksCompilation* compilation, uint64_t duration); + + /** + * Set the execution priority. + * + * Execution priorities are relative to other executions created by the same + * application (specifically same uid) for the same device. Specifically, + * priorities of executions from one application will not affect executions + * from another application. Similarly, priorities of executions on one device + * will not affect executions on another device. + * + * Higher priority executions may use more compute resources than lower + * priority executions, and may preempt or starve lower priority executions. + * + * See {@link ANeuralNetworksCompilation} for information on multithreaded + * usage. + * + * Available since API level 30. + * + * @param compilation The compilation to be modified. + * @param priority The relative priority of the execution compared to other + * executions created by the application. Must be one of + * ANEURALNETWORKS_PRIORITY_*. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + */ + int (*ANeuralNetworksCompilation_setPriority)( + ANeuralNetworksCompilation* compilation, int priority); + /** * Schedule synchronous evaluation of the execution. * @@ -813,6 +883,84 @@ struct NnApi { */ int (*ANeuralNetworksExecution_compute)(ANeuralNetworksExecution* execution); + /** + * Set the maximum expected duration of the specified execution. + * + * If the device is not able to complete the execution within the specified + * duration, the execution may be aborted. The timeout duration begins at a + * call to one of: + * - {@link ANeuralNetworksExecution_burstCompute} + * - {@link ANeuralNetworksExecution_compute} + * - {@link ANeuralNetworksExecution_startCompute} + * - {@link ANeuralNetworksExecution_startComputeWithDependencies} + * + * This timeout duration acts as a hint to drivers, and can be used to both + * free up compute resources within the driver and return control back to the + * application quicker than is possible without the hint. It enables drivers + * that are able to estimate how long an execution will take to abort the + * execution before it has even started if the driver believes the execution + * cannot be completed within the timeout duration. Similarly, it enables + * drivers to abort an ongoing execution if it is taking too long. However, + * this call does not guarantee that the execution will complete or abort + * within the timeout duration. + * + * By default (i.e., unless ANeuralNetworksExecution_setTimeout is called), + * the timeout duration for execution is considered infinite. 
+ * + * The {@link ANeuralNetworksExecution} must have been created from an + * {@link ANeuralNetworksCompilation} which in turn was created from + * {@link ANeuralNetworksCompilation_createForDevices} with numDevices = 1, + * otherwise this function will fail with ANEURALNETWORKS_BAD_DATA. If the + * device has a feature level reported by + * {@link ANeuralNetworksDevice_getFeatureLevel} that is lower than 30, then + * the timeout duration hint will be ignored. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded + * usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that is expected + * to be spent executing a model. If this duration is exceeded, the execution + * may be aborted. If set to 0, the timeout duration is considered + * infinite. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * + * Available since API level 30. + */ + int (*ANeuralNetworksExecution_setTimeout)( + ANeuralNetworksExecution* execution, uint64_t duration); + + /** + * Set the maximum duration of WHILE loops in the specified execution. + * + * This is a fuzzy per-loop timeout intended to prevent infinite loops. + * + * If a WHILE loop condition model does not output false within the specified + * duration, the execution will be aborted. + * + * See {@link ANeuralNetworks_getDefaultLoopTimeout} and + * {@link ANeuralNetworks_getMaximumLoopTimeout} for the default + * and maximum timeout values. + * + * See {@link ANeuralNetworksExecution} for information on multithreaded + * usage. + * + * @param execution The execution to be modified. + * @param duration The maximum amount of time in nanoseconds that can be spent + * executing a WHILE loop. If the specified duration value exceeds the + * value produced by {@link ANeuralNetworks_getMaximumLoopTimeout}, it will be + * overridden by that value. + * + * @return ANEURALNETWORKS_NO_ERROR if successful. + * ANEURALNETWORKS_BAD_STATE if execution has started. + * ANEURALNETWORKS_UNEXPECTED_NULL if execution is NULL. + * + * Available since API level 30. 
+ */ + int (*ANeuralNetworksExecution_setLoopTimeout)( + ANeuralNetworksExecution* execution, uint64_t duration); + /** * Get the dimensional information of the specified output operand of the * model of the From 4ee27d9668f46c89b93fd5d306e25ede4e6a2f09 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Fri, 15 May 2020 14:46:00 -0700 Subject: [PATCH 299/412] Add benchmarks for scalar conversions PiperOrigin-RevId: 311805249 Change-Id: Id1f499cfe6a1aac7ab1fc8d8339ce7b4031d4b6b --- tensorflow/python/eager/benchmarks_test.py | 40 ++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py index 227fca5ea6f..3056d1a98ea 100644 --- a/tensorflow/python/eager/benchmarks_test.py +++ b/tensorflow/python/eager/benchmarks_test.py @@ -1194,6 +1194,46 @@ class MicroBenchmarks(benchmarks_test_base.MicroBenchmarksBase): self._run(fn, 10000) + def _benchmark_convert_constant(self, value, cached): + global GLOBAL_TEST_VALUE + GLOBAL_TEST_VALUE = value + + def cached_func(): + ops.convert_to_tensor(value) + + def uncached_func(): + global GLOBAL_TEST_VALUE + GLOBAL_TEST_VALUE += 1 + ops.convert_to_tensor(GLOBAL_TEST_VALUE) + + func = cached_func if cached else uncached_func + + self._run(func, 10000) + + def benchmark_convert_python_int(self): + self._benchmark_convert_constant(42, cached=True) + + def benchmark_convert_python_int_uncached(self): + self._benchmark_convert_constant(42, cached=False) + + def benchmark_convert_python_float(self): + self._benchmark_convert_constant(42.0, cached=True) + + def benchmark_convert_python_float_uncached(self): + self._benchmark_convert_constant(42.0, cached=False) + + def benchmark_convert_numpy_int(self): + self._benchmark_convert_constant(np.array(42), cached=True) + + def benchmark_convert_numpy_int_uncached(self): + self._benchmark_convert_constant(np.array(42), cached=False) + + def benchmark_convert_numpy_float(self): + self._benchmark_convert_constant(np.array(42.0), cached=True) + + def benchmark_convert_numpy_float_uncached(self): + self._benchmark_convert_constant(np.array(42.0), cached=False) + @test_util.disable_tfrt("convert to tensor not supported") def benchmark_convert_3x_list_to_tensor(self): xs = [1, 2, 3] From c61bc6a4f32dc697b8eb51ef3bf490e8d0780228 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 15 May 2020 14:46:55 -0700 Subject: [PATCH 300/412] Support cancellation in multi-device and distributed function execution. In executing a multi-device or distributed function, one component function failure could cause other component functions to hang due to dependencies (e.g., they are pending receiving tensors from the failed component function). This can often lead to issues that are hard to debug especially with a large number of workers. This change cancels local and remote component functions in multi-device function execution if one component function fails, by cancelling the function rendezvous and the component function execution request RPCs. Since the cancelled errors are marked as derived, the original failure error message will be reported to users. 
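A hypothetical, heavily simplified Python sketch (not part of this patch) of the failure mode being addressed; the two logical CPUs stand in for separate workers, and the logical-device setup assumes a TF 2.2+ tf.config API:

    # One component of a multi-device function fails; the other component
    # consumes a tensor sent from it. With this change the healthy component
    # is cancelled instead of hanging, and the original error is reported.
    import tensorflow as tf

    cpu = tf.config.list_physical_devices("CPU")[0]
    tf.config.set_logical_device_configuration(
        cpu, [tf.config.LogicalDeviceConfiguration()] * 2)

    @tf.function
    def f(x):
      with tf.device("/cpu:0"):
        y = tf.debugging.check_numerics(x, "component failure")  # fails on NaN
      with tf.device("/cpu:1"):
        return y + 1.0  # depends on a tensor produced by the failing component

    try:
      f(tf.constant(float("nan")))
    except tf.errors.InvalidArgumentError as e:
      print("original failure surfaced:", e.message)
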
PiperOrigin-RevId: 311805431 Change-Id: I2f0b819e2b0a228fdeb242361b41ef4cadc7e3d2 --- tensorflow/c/eager/BUILD | 3 + tensorflow/c/eager/c_api_remote_test.cc | 180 ++++++++++++++++++ .../process_function_library_runtime.cc | 99 ++++++---- .../core/distributed_runtime/eager/BUILD | 2 + .../eager/cluster_function_library_runtime.cc | 24 ++- .../distributed_runtime/eager/eager_client.h | 6 +- .../eager/eager_service_impl_test.cc | 3 +- .../core/distributed_runtime/rpc/eager/BUILD | 1 + .../rpc/eager/grpc_eager_client.cc | 14 +- .../rpc/rpc_rendezvous_mgr.cc | 14 ++ 10 files changed, 305 insertions(+), 41 deletions(-) diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index fe4d5ac6ffe..0180b4bdee2 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -357,10 +357,13 @@ tf_cuda_cc_test( ":c_api_test_util", ":tfe_tensorhandle_internal", "//tensorflow/c:c_test_util", + "//tensorflow/core:framework", + "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/common_runtime:function_optimization_registry", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", "@com_google_absl//absl/strings", diff --git a/tensorflow/c/eager/c_api_remote_test.cc b/tensorflow/c/eager/c_api_remote_test.cc index d04e4ef4212..93d830d2c90 100644 --- a/tensorflow/c/eager/c_api_remote_test.cc +++ b/tensorflow/c/eager/c_api_remote_test.cc @@ -19,11 +19,16 @@ limitations under the License. #include "tensorflow/c/eager/c_api_test_util.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" +#include "tensorflow/core/common_runtime/function_optimization_registry.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/casts.h" +#include "tensorflow/core/platform/errors.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/cluster.pb.h" +#include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" namespace { @@ -574,6 +579,181 @@ TEST(CAPI, TestRemoteFunctionWithPackedInput) { TestFunctionWithPackedInput(/*remote=*/true); } +string VariableAddFunction() { + tensorflow::FunctionDef def; + CHECK(tensorflow::protobuf::TextFormat::ParseFromString( + " signature {" + " name: 'VariableAddFunction'" + " input_arg {" + " name: 'var0'" + " type: DT_RESOURCE" + " }" + " output_arg {" + " name: 'var0_value'" + " type: DT_FLOAT" + " }" + " }" + " node_def {" + " name: 'read0'" + " op: 'ReadVariableOp'" + " input: 'var0'" + " attr {" + " key: 'dtype'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'add'" + " op: 'Add'" + " input: 'read0:value:0'" + " input: 'read0:value:0'" + " device: '/job:localhost/task:1/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " node_def {" + " name: 'identity'" + " op: 'Identity'" + " input: 'add:z:0'" + " device: '/job:localhost/task:0/device:CPU:0'" + " attr {" + " key: 'T'" + " value {" + " type: DT_FLOAT" + " }" + " }" + " }" + " ret {" + " key: 'var0_value'" + " value: 'identity:output:0'" + " }", + &def)); + return def.SerializeAsString(); +} + +class FunctionErrorInjectionPass : public 
tensorflow::FunctionOptimizationPass { + public: + FunctionErrorInjectionPass(string error_node, string error_device) + : error_node_(error_node), error_device_(error_device) {} + tensorflow::Status Run(const tensorflow::DeviceSet& device_set, + const tensorflow::ConfigProto& config_proto, + std::unique_ptr* graph, + tensorflow::FunctionLibraryDefinition* flib_def, + std::vector* control_ret_node_names, + bool* control_rets_updated) override { + // Inject failure to function instantiation if finding a node that contains + // the given node name (error_node_) and requested device (error_device_). + for (const auto node : graph->get()->nodes()) { + if (node->name().find(error_node_) != string::npos && + node->requested_device() == error_device_) { + return tensorflow::errors::Internal("Injected graph pass error."); + } + } + return tensorflow::Status::OK(); + } + + private: + const string error_node_; + const string error_device_; +}; + +void TestDistributedFunctionCancellation(bool inject_error) { + tensorflow::ServerDef server_def = GetServerDef(3); + // This server def has the task index set to 0. + string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + std::unique_ptr worker_server1; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server1) + .ok()); + ASSERT_TRUE(worker_server1->Start().ok()); + server_def.set_task_index(2); + std::unique_ptr worker_server2; + ASSERT_TRUE(tensorflow::GrpcServer::Create( + server_def, tensorflow::Env::Default(), &worker_server2) + .ok()); + ASSERT_TRUE(worker_server2->Start().ok()); + const char dev2_name[] = "/job:localhost/replica:0/task:2/device:CPU:0"; + + if (inject_error) { + // Inject a function optimization pass failure when it sees the 'read0' op + // having a requested device `dev2_name`. 
During execution: + // * task:0 processes the main function `VariableAddFunction` and places + // the read0 op on task:2 + // * task:0 partitions the main function with a subgraph containing read0 + // sent to task:2 + // * task:2 graph pass reports an error when it sees read0 with dev2_name + tensorflow::function_optimization_registration:: + FunctionOptimizationPassRegistration register_test_pass( + std::make_unique("read0", dev2_name)); + } + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_ContextSetServerDef(ctx, 0, serialized.data(), serialized.size(), status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* var_handle = TestVariable(ctx, 2.0, dev2_name); + EXPECT_NE(var_handle, nullptr); + + const string function_def = VariableAddFunction(); + TFE_ContextAddFunctionDef(ctx, function_def.data(), function_def.size(), + status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + + TFE_Op* func = TFE_NewOp(ctx, "VariableAddFunction", status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_OpAddInput(func, var_handle, status); + ASSERT_EQ(TF_GetCode(status), TF_OK) << TF_Message(status); + TFE_TensorHandle* retvals[1] = {nullptr}; + int num_retvals = 1; + TFE_Execute(func, &retvals[0], &num_retvals, status); + + if (inject_error) { + ASSERT_EQ(TF_INTERNAL, TF_GetCode(status)) << TF_Message(status); + } else { + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + ASSERT_EQ(1, num_retvals); + TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retvals[0]); + float sum = 0; + ASSERT_EQ(sizeof(sum), TF_TensorByteSize(t)); + memcpy(&sum, TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + ASSERT_EQ(sum, 4.0); + } + + TFE_DeleteOp(func); + TFE_DeleteTensorHandle(var_handle); + TFE_DeleteContext(ctx); + TF_DeleteStatus(status); + + // TODO(b/136478427): Figure out how to correctly shut the server down. + worker_server1.release(); + worker_server2.release(); +} + +TEST(CAPI, DistributedFunctionNoError) { + TestDistributedFunctionCancellation(false); +} + +TEST(CAPI, DistributedFunctionCancelledOnError) { + TestDistributedFunctionCancellation(true); +} + void TestRemoteExecuteDeleteContextWithOutstandingRPC(bool async) { tensorflow::ServerDef server_def = GetServerDef(2); diff --git a/tensorflow/core/common_runtime/process_function_library_runtime.cc b/tensorflow/core/common_runtime/process_function_library_runtime.cc index 271169f2a5e..364750b6679 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime.cc @@ -31,6 +31,7 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/rendezvous_mgr.h" #include "tensorflow/core/common_runtime/rendezvous_util.h" #include "tensorflow/core/common_runtime/replicate_per_replica_nodes.h" +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_to_functiondef.h" #include "tensorflow/core/framework/op_kernel.h" @@ -230,7 +231,7 @@ FunctionLibraryRuntime* ProcessFunctionLibraryRuntime::GetFLR( Device* device = nullptr; if (device_name != kDefaultFLRDevice) { if (!device_mgr_->LookupDevice(device_name, &device).ok()) { - VLOG(1) << "Could not find device: " << device_name; + VLOG(4) << "Could not find device: " << device_name; return nullptr; } } @@ -1046,7 +1047,37 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( return; } - auto* refcounted_done = new ReffedStatusCallback(std::move(done)); + // A locally created cancellation manager, used only when the caller does not + // provide one in argument. + std::shared_ptr local_cm; + CancellationManager* cm = opts.cancellation_manager; + if (cm == nullptr) { + local_cm = std::make_shared(); + cm = local_cm.get(); + } + auto token = cm->get_cancellation_token(); + const auto cancelled_error = errors::Cancelled( + "ProcessFunctionLibraryRuntime::RunMultiDevice was cancelled."); + const bool already_cancelled = !cm->RegisterCallback( + token, + [rendez = opts.rendezvous, n_func = data->glue_.size(), cancelled_error] { + // Abort rendezvous only if there are more than one component functions + // to avoid reporting cancellation error directly to PartitionedCallOps + // that launch a single component function. + if (rendez && n_func > 1) { + rendez->StartAbort(cancelled_error); + } + }); + if (already_cancelled) { + done(cancelled_error); + return; + } + + auto* refcounted_done = new ReffedStatusCallback( + [cm, token, local_cm, done = std::move(done)](const Status& s) { + cm->TryDeregisterCallback(token); + done(s); + }); for (int i = 0; i < data->glue_.size(); ++i) { refcounted_done->Ref(); } @@ -1059,7 +1090,7 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( opts_copy.args_alloc_attrs = comp_data.arg_alloc_attrs; opts_copy.rets_alloc_attrs = comp_data.ret_alloc_attrs; - opts_copy.remote_execution = false; + opts_copy.cancellation_manager = cm; InternalArgs comp_args; Status s = get_component_args(comp_data, &comp_args); @@ -1067,13 +1098,39 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( VLOG(2) << "Failed to get component function arguments: " << s; refcounted_done->UpdateStatus(s); refcounted_done->Unref(); + cm->StartCancel(); continue; } std::vector* comp_rets = new std::vector; rets->resize(data->num_outputs_); + auto component_fn_callback = [comp_rets, rets, comp_data, refcounted_done, + cm, local_cm, data, + target](const Status& status) { + if (!status.ok()) { + VLOG(2) << "Component function execution on target " << target + << " failed: " << status; + const string function_and_msg = strings::StrCat( + errors::FormatFunctionForError(data->function_name_), " ", + status.error_message()); + refcounted_done->UpdateStatus(Status(status.code(), function_and_msg)); + // Cancel the execution of other component functions. 
+ cm->StartCancel(); + } else { + VLOG(2) << "Component function execution on target " << target + << " succeeded."; + for (int i = 0; i < comp_rets->size(); ++i) { + (*rets)[comp_data.ret_indices[i]] = (*comp_rets)[i]; + } + } + delete comp_rets; + // refcounted_done is thread-safe + refcounted_done->Unref(); + }; + FunctionLibraryRuntime* flr = GetFLR(target); if (flr != nullptr) { + opts_copy.remote_execution = false; // When target device has private thread pool, use the target device // runner thread::ThreadPool* pool = flr->device()->tensorflow_device_thread_pool(); @@ -1084,24 +1141,7 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( VLOG(4) << " with " << opts_copy.DebugString(); flr->Run(opts_copy, handle, GetLocalArgs(comp_args.args), comp_rets, - [comp_rets, rets, comp_data, refcounted_done, - data](const Status& status) { - if (!status.ok()) { - VLOG(2) << "Component function execution failed: " << status; - const string function_and_msg = strings::StrCat( - errors::FormatFunctionForError(data->function_name_), - " ", status.error_message()); - refcounted_done->UpdateStatus( - Status(status.code(), function_and_msg)); - } else { - for (int i = 0; i < comp_rets->size(); ++i) { - (*rets)[comp_data.ret_indices[i]] = (*comp_rets)[i]; - } - } - delete comp_rets; - // refcounted_done is thread-safe - refcounted_done->Unref(); - }); + std::move(component_fn_callback)); } else { opts_copy.remote_execution = true; @@ -1109,21 +1149,8 @@ void ProcessFunctionLibraryRuntime::RunMultiDevice( << " with handle " << handle; VLOG(4) << " with " << opts_copy.DebugString(); - RunInternal( - opts_copy, handle, comp_args.args, comp_rets, cleanup_items, - [comp_rets, rets, comp_data, refcounted_done](const Status& status) { - if (!status.ok()) { - VLOG(2) << "Component function execution failed: " << status; - refcounted_done->UpdateStatus(status); - } else { - for (int i = 0; i < comp_rets->size(); ++i) { - (*rets)[comp_data.ret_indices[i]] = (*comp_rets)[i]; - } - } - delete comp_rets; - // refcounted_done is thread-safe - refcounted_done->Unref(); - }); + RunInternal(opts_copy, handle, comp_args.args, comp_rets, cleanup_items, + std::move(component_fn_callback)); } } refcounted_done->Unref(); diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD index c7fdfa176b1..c27758cbb44 100644 --- a/tensorflow/core/distributed_runtime/eager/BUILD +++ b/tensorflow/core/distributed_runtime/eager/BUILD @@ -42,6 +42,7 @@ cc_library( "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:tensor_handle", + "//tensorflow/core/distributed_runtime:call_options", "//tensorflow/core/distributed_runtime:worker_session", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", @@ -68,6 +69,7 @@ cc_library( "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/distributed_runtime:call_options", ], ) diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc index ec129173833..55f0697d2b4 100644 --- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc @@ -20,9 +20,11 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" #include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/distributed_runtime/call_options.h" #include "tensorflow/core/distributed_runtime/eager/eager_client.h" #include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h" #include "tensorflow/core/distributed_runtime/eager/remote_mgr.h" +#include "tensorflow/core/framework/cancellation.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/lib/core/errors.h" @@ -189,13 +191,31 @@ void EagerClusterFunctionLibraryRuntime::Run( op->Attrs().FillAttrValueMap(remote_op->mutable_attrs()); remote_op->set_device(function_data->target); + CancellationManager* cm = opts.cancellation_manager; + CancellationToken token = 0; + auto call_opts = std::make_shared(); + if (cm != nullptr) { + token = cm->get_cancellation_token(); + const bool already_cancelled = !cm->RegisterCallback( + token, + [call_opts, request, response, done]() { call_opts->StartCancel(); }); + if (already_cancelled) { + done(errors::Cancelled("EagerClusterFunctionLibraryRuntime::Run")); + return; + } + } + // Execute component function on remote worker using RunComponentFunction RPC. // Different from executing remote functions with Enqueue, this method runs // a function on remote worker without tying up a thread (i.e., pure // asynchronously). eager_client->RunComponentFunctionAsync( - request.get(), response.get(), - [request, response, rets, done = std::move(done)](const Status& s) { + call_opts.get(), request.get(), response.get(), + [request, response, rets, call_opts, cm, token, + done = std::move(done)](const Status& s) { + if (cm != nullptr) { + cm->TryDeregisterCallback(token); + } if (!s.ok()) { done(s); return; diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h index 9ca802d8a72..d6cf0943176 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_client.h +++ b/tensorflow/core/distributed_runtime/eager/eager_client.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ +#include "tensorflow/core/distributed_runtime/call_options.h" #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env.h" @@ -38,12 +39,15 @@ class EagerClient : public core::RefCounted { CLIENT_METHOD(UpdateContext); CLIENT_METHOD(Enqueue); CLIENT_METHOD(WaitQueueDone); - CLIENT_METHOD(RunComponentFunction); CLIENT_METHOD(KeepAlive); CLIENT_METHOD(CloseContext); #undef CLIENT_METHOD + virtual void RunComponentFunctionAsync( + CallOptions* call_opts, const RunComponentFunctionRequest* request, + RunComponentFunctionResponse* response, StatusCallback done) = 0; + // Feeds `request` into the request stream of EagerService::StreamingEnqueue. // `response` will be filled with the response for this `request`. 
The // 1-to-1 correspondence between requests and responses is a property diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index 46a6181cfa9..3c537d99a3a 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -90,7 +90,8 @@ class FakeEagerClient : public EagerClient { CLIENT_METHOD(CloseContext); #undef CLIENT_METHOD - void RunComponentFunctionAsync(const RunComponentFunctionRequest* request, + void RunComponentFunctionAsync(CallOptions* call_opts, + const RunComponentFunctionRequest* request, RunComponentFunctionResponse* response, StatusCallback done) override { impl_->RunComponentFunction(request, response, std::move(done)); diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD index d7251029d10..c1deabc23cd 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/BUILD +++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD @@ -29,6 +29,7 @@ cc_library( "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/distributed_runtime:call_options", "//tensorflow/core/distributed_runtime/eager:eager_client", "//tensorflow/core/distributed_runtime/rpc:grpc_channel", "//tensorflow/core/distributed_runtime/rpc:grpc_client_cq_tag", diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index 752bfdf71a1..c8288f28c36 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h" #include "grpcpp/generic/generic_stub.h" +#include "tensorflow/core/distributed_runtime/call_options.h" #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_state.h" @@ -135,7 +136,6 @@ class GrpcEagerClient : public EagerClient { CLIENT_METHOD(UpdateContext); CLIENT_METHOD(Enqueue); CLIENT_METHOD(WaitQueueDone); - CLIENT_METHOD(RunComponentFunction); CLIENT_METHOD(KeepAlive); #undef CLIENT_METHOD @@ -164,6 +164,18 @@ class GrpcEagerClient : public EagerClient { } } + void RunComponentFunctionAsync(CallOptions* call_opts, + const RunComponentFunctionRequest* request, + RunComponentFunctionResponse* response, + StatusCallback done) override { + StatusCallback done_wrapped = callback_wrapper(std::move(done)); + new RPCState( + &stub_, cq_, "/tensorflow.eager.EagerService/RunComponentFunction", + *request, response, std::move(done_wrapped), call_opts, + /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true, + &target_); + } + void StreamingEnqueueAsync(const EnqueueRequest* request, EnqueueResponse* response, StatusCallback done) override { diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc index 5bb61eb8cc1..b973421efa4 100644 --- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc +++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc @@ -144,6 +144,20 @@ class RpcRecvTensorCall : public BaseRecvTensorCall { recv_done(); }; wi_->RecvTensorAsync(&opts_, &req_, &resp_, std::move(cb)); + + // NOTE: Check if the rendezvous was aborted after sending out the RPC. The + // ordering is important because `StartAbort` could be called right before + // the `RecvTensorAsync` request registers its RPC cancellation to `opts_`. + // In that case, the previous `StartAbort` would not trigger the + // cancellation of this call. + Status s; + { + mutex_lock l(mu_); + s = status_; + } + if (!s.ok()) { + opts_.StartCancel(); + } } string src_worker_; From cf35170ceaebd332683582eb08b4315708b55f76 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 14:47:11 -0700 Subject: [PATCH 301/412] Fixes linkage error on pre-18 Android where GLESv3 is not available With this CL gpu delegate is linkable for Android Apps supporting pre-18 API level. This solution works because tflite gpu delegate only weak-imports OpenGL ES 3+ symbols. However, it may results in a runtime crash if gpu delegate tries to use GLES3 symbols on those devices. A reasonable solution for pre-18 API is refusing to delegate? Two symbols were behaving as strong symbols ("glUnmapBuffer", "glMapBufferRange") because they were defined in a template only class, which would get preprocessed before #define that redefines GLES symbols into weak symbols. 
PiperOrigin-RevId: 311805477 Change-Id: Ia217ebe64a975092a43869ece7d42f64c33bf795 --- tensorflow/lite/delegates/gpu/BUILD | 12 ++++++++++-- tensorflow/lite/delegates/gpu/gl/gl_buffer.cc | 13 +++++++++++++ tensorflow/lite/delegates/gpu/gl/gl_buffer.h | 6 ++---- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index 2581232bc2b..c667c2056f4 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -32,7 +32,11 @@ cc_library( linkopts = select({ "//tensorflow:android": [ "-lEGL", - "-lGLESv3", + # We don't need to link libGLESv3, because if it exists, + # it is a symlink to libGLESv2. + # See Compatibility Definition Document: + # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es + "-lGLESv2", ], "//conditions:default": [], }), @@ -220,7 +224,11 @@ cc_library( linkopts = select({ "//tensorflow:android": [ "-lEGL", - "-lGLESv3", + # We don't need to link libGLESv3, because if it exists, + # it is a symlink to libGLESv2. + # See Compatibility Definition Document: + # https://source.android.com/compatibility/10/android-10-cdd#7_1_4_1_opengl_es + "-lGLESv2", ], "//conditions:default": [], }), diff --git a/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc b/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc index 1de49676219..344e494690a 100644 --- a/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc +++ b/tensorflow/lite/delegates/gpu/gl/gl_buffer.cc @@ -145,6 +145,19 @@ absl::Status CreatePersistentBuffer(size_t size, return absl::OkStatus(); } +namespace gl_buffer_internal { + +BufferMapper::BufferMapper(GLenum target, size_t offset, size_t bytes, + GLbitfield access) + : target_(target), + data_(glMapBufferRange(target_, offset, bytes, access)) {} + +BufferMapper::~BufferMapper() { + TFLITE_GPU_CALL_GL(glUnmapBuffer, target_).IgnoreError(); +} + +}; // namespace gl_buffer_internal + } // namespace gl } // namespace gpu } // namespace tflite diff --git a/tensorflow/lite/delegates/gpu/gl/gl_buffer.h b/tensorflow/lite/delegates/gpu/gl/gl_buffer.h index 3225679ec5a..1877fb1f144 100644 --- a/tensorflow/lite/delegates/gpu/gl/gl_buffer.h +++ b/tensorflow/lite/delegates/gpu/gl/gl_buffer.h @@ -229,11 +229,9 @@ class BufferBinder { // RAII for mapping and unmapping a buffer. class BufferMapper { public: - BufferMapper(GLenum target, size_t offset, size_t bytes, GLbitfield access) - : target_(target), - data_(glMapBufferRange(target_, offset, bytes, access)) {} + BufferMapper(GLenum target, size_t offset, size_t bytes, GLbitfield access); - ~BufferMapper() { TFLITE_GPU_CALL_GL(glUnmapBuffer, target_).IgnoreError(); } + ~BufferMapper(); void* data() { return data_; } From bc96c17ece9424c9d1f3f4d80675a1faadad247d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 14:58:07 -0700 Subject: [PATCH 302/412] Handle head extract outside compilation cluster device assignment. 
PiperOrigin-RevId: 311807325 Change-Id: I0155a0d4e1aa62c29d0a58c3539b3eba22e7e85c --- ...extract_head_tail_outside_compilation.mlir | 195 +++++++++++------- ...u_extract_head_tail_outside_compilation.cc | 100 ++++++++- 2 files changed, 218 insertions(+), 77 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir index eb67bdcc914..90fa8cff5dc 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_extract_head_tail_outside_compilation.mlir @@ -2,80 +2,135 @@ // Tests extraction of a outside compiled ops at head of TPU computation. -func @single_head_outside_compilation(%arg0 : tensor) -> () { - // CHECK: tf_device.launch - // CHECK: "tf.A" - // CHECK-NEXT: tf_device.return - // - // CHECK: "tf_device.cluster" - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () - "tf.B"() : () -> () - "tf.C"() : () -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @single_head_outside_compilation + func @single_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: tf_device.launch + // + // CHECK: "tf.A" + // CHECK-NEXT: tf_device.return + // + // CHECK: device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> () + "tf.B"() : () -> () + "tf.C"() : () -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } -// CHECK-LABEL: func @multiple_head_outside_compilation -func @multiple_head_outside_compilation(%arg0 : tensor) -> () { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) - // CHECK: "tf.C" - // CHECK-NEXT: tf_device.return %[[B_OUT]] - // - // CHECK: "tf_device.cluster" - // CHECK: "tf.D"(%[[LAUNCH_OUT]]) - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () - "tf.D"(%1) : (tensor) -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @multiple_head_outside_compilation + func @multiple_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[B_OUT:.*]] = "tf.B"(%[[A_OUT]]) + // CHECK: "tf.C" + // CHECK-NEXT: tf_device.return %[[B_OUT]] + // CHECK: 
device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.D"(%[[LAUNCH_OUT]]) + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + "tf.C"(%1, %arg0) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> () + "tf.D"(%1) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } -// CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle -func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { - // CHECK-NOT: tf_device.launch - // CHECK: "tf_device.cluster" - // CHECK-NEXT: "tf.A" - // CHECK-NEXT: "tf.B" - // CHECK-NEXT: "tf.C" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {} : (tensor) -> (tensor) - %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) - "tf.C"(%1) : (tensor) -> () - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_do_not_outside_compiled_ops_in_middle + func @test_do_not_outside_compiled_ops_in_middle(%arg0 : tensor) -> () { + // CHECK-NOT: tf_device.launch + // CHECK: "tf_device.cluster" + // CHECK-NEXT: "tf.A" + // CHECK-NEXT: "tf.B" + // CHECK-NEXT: "tf.C" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {} : (tensor) -> (tensor) + %1 = "tf.B"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + "tf.C"(%1) : (tensor) -> () + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } } -// CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted -func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { - // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() - // CHECK: %[[A_OUT:.*]] = "tf.A" - // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) - // CHECK-NEXT: tf_device.return %[[D_OUT]] - // - // CHECK: "tf_device.cluster" - // CHECK: "tf.B" - // CHECK: "tf.C" - // CHECK: "tf.E" - // CHECK-NEXT: tf_device.return - "tf_device.cluster"() ( { - %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) - %1 = "tf.B"() {} : () -> (tensor) - %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) - %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) - %4 = "tf.E"(%3) {} : (tensor) -> (tensor) - tf_device.return - }) {device = "tpu0", launch_attr = "launch_attr"} : () -> () - return +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_ops_with_tpu_operands_not_extracted + func @test_ops_with_tpu_operands_not_extracted(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: 
tf_device.return %[[D_OUT]] + // CHECK: device + // CHECK-SAME: "/job:worker/replica:0/task:0/device:CPU:0" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + return + } +} + +// ----- + +module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0"]} { + // CHECK-LABEL: func @test_replicated_head_outside_compilation + func @test_replicated_head_outside_compilation(%arg0 : tensor) -> () { + // CHECK: %[[LAUNCH_OUT:.*]] = "tf_device.launch"() + // CHECK: %[[A_OUT:.*]] = "tf.A" + // CHECK: %[[D_OUT:.*]] = "tf.D"(%[[A_OUT]]) + // CHECK-NEXT: tf_device.return %[[D_OUT]] + // CHECK: device + // CHECK-SAME: "TPU_REPLICATED_HOST" + // + // CHECK: "tf_device.cluster" + // CHECK: "tf.B" + // CHECK: "tf.C" + // CHECK: "tf.E" + // CHECK-NEXT: tf_device.return + tf_device.replicate() {n = 2 : i32} { + "tf_device.cluster"() ( { + %0 = "tf.A"(%arg0) {_xla_outside_compilation = "cluster1"} : (tensor) -> (tensor) + %1 = "tf.B"() {} : () -> (tensor) + %2 = "tf.C"(%arg0, %1) {_xla_outside_compilation = "cluster1"} : (tensor, tensor) -> (tensor) + %3 = "tf.D"(%0) {_xla_outside_compilation = "cluster1"}: (tensor) -> (tensor) + %4 = "tf.E"(%3) {} : (tensor) -> (tensor) + tf_device.return + }) {num_cores_per_replica = 1, step_marker_location = "", padding_map = [], topology = "", device_assignment = []} : () -> () + tf_device.return + } + return + } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc index b9e214470cd..02d0c3e849b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_extract_head_tail_outside_compilation.cc @@ -20,17 +20,22 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FormatVariadic.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" namespace mlir { namespace TFTPU { @@ -188,6 +193,82 @@ llvm::Optional IsolateHeadExtractedOpsToLaunchOp( return host_launch_op; } +// Parses TPU compilation and execution device form tpu cluster and assigns +// host device to `host_launch` device attribute. +LogicalResult SetCompilationDeviceToHostLaunch( + OpBuilder* builder, mlir::TF::RuntimeDevices devices, + tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { + auto num_cores_per_replica_attr = tpu_cluster.getAttrOfType( + tensorflow::kNumCoresPerReplicaAttr); + if (!num_cores_per_replica_attr) + return tpu_cluster.emitOpError( + "cluster op missing `num_cores_per_replica` attribute"); + + if (num_cores_per_replica_attr.getInt() != 1) + return tpu_cluster.emitOpError( + "outside compilation is not supported with model parallelism."); + + auto topology_attr = + tpu_cluster.getAttrOfType(tensorflow::kTopologyAttr); + if (!topology_attr) + return tpu_cluster.emitOpError("cluster op missing `topology` attribute"); + + auto device_assignment_attr = tpu_cluster.getAttrOfType( + tensorflow::kDeviceAssignmentAttr); + if (!device_assignment_attr) + return tpu_cluster.emitOpError( + llvm::formatv("requires attribute '{0}'", + tensorflow::kDeviceAssignmentAttr) + .str()); + + auto status_or_device_coodinates = + tensorflow::GetDeviceCoordinates(device_assignment_attr); + + if (!status_or_device_coodinates.ok()) + return tpu_cluster.emitError() + << "error in fetching tpu device coordinates: " + << status_or_device_coodinates.status().error_message(); + + // Determine compilation and execution devices. + auto status_or_tpu_device_assignment = + tensorflow::GetTPUCompilationAndExecutionDevices( + devices.device_names(), /*num_replicas=*/1, + /*num_cores_per_replica=*/1, topology_attr.getValue(), + status_or_device_coodinates.ConsumeValueOrDie()); + if (!status_or_tpu_device_assignment.ok()) + return tpu_cluster.emitError() + << "error in fetching TPU compilation/execution devices: " + << status_or_tpu_device_assignment.status().error_message(); + auto& tpu_device_assignment = status_or_tpu_device_assignment.ValueOrDie(); + host_launch.deviceAttr( + builder->getStringAttr(tpu_device_assignment.tpu_devices[0][0].host)); + + return success(); +} + +// Assigns host device attribute to host launch op or enclosing +// tf_device.replicate op if TPU computation is replicated. 
+LogicalResult HandleHostLaunchDeviceAssignment( + OpBuilder* builder, mlir::TF::RuntimeDevices devices, + tf_device::ClusterOp tpu_cluster, tf_device::LaunchOp host_launch) { + auto parent_replicate_op = + llvm::dyn_cast_or_null(host_launch.getParentOp()); + // If computation is replicated, then add TPU_REPLICATED_HOST device alias + // to the host launch op. This device alias would later be a reference to + // host device string in the device map of tf_device.replicate op + // during tpu_rewrite pass. + if (parent_replicate_op) { + host_launch.deviceAttr( + builder->getStringAttr(tensorflow::kTPUReplicatedHost)); + } else { + if (failed(SetCompilationDeviceToHostLaunch(builder, devices, tpu_cluster, + host_launch))) + return failure(); + } + + return success(); +} + struct TPUExtractHeadTailOutsideCompilation : public PassWrapper> { @@ -202,17 +283,22 @@ void TPUExtractHeadTailOutsideCompilation::runOnOperation() { return signalPassFailure(); OpBuilder builder(&getContext()); - module.walk([&](tf_device::ClusterOp cluster) { + auto result = module.walk([&](tf_device::ClusterOp cluster) { auto head_outside_compiled_ops = IdentifyOutsideCompiledOpsAtHead(cluster); - IsolateHeadExtractedOpsToLaunchOp(&builder, cluster, - head_outside_compiled_ops); - - // TODO(b/156030523): Update device attribute of newly created host launch - // op as well as enclosing Replicate op (if TPU computation is replicated) - // with host device names. + auto host_launch_op = IsolateHeadExtractedOpsToLaunchOp( + &builder, cluster, head_outside_compiled_ops); + if (host_launch_op) { + if (failed(HandleHostLaunchDeviceAssignment(&builder, devices, cluster, + *host_launch_op))) { + return WalkResult::interrupt(); + } + } // TODO(b/155115766): Implement tail outside compiled op extraction. + return WalkResult::advance(); }); + + if (result.wasInterrupted()) signalPassFailure(); } } // anonymous namespace From 0c7e5ac6c9666ab1d9be3076a14ce128c8ef3403 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 15:02:32 -0700 Subject: [PATCH 303/412] Enable SVD shape test in eager mode. PiperOrigin-RevId: 311808080 Change-Id: I5d77485f0f17aae4647aca0b4512f231fd1f3290 --- tensorflow/python/kernel_tests/svd_op_test.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py index 6c2199cc591..eae42f55a3f 100644 --- a/tensorflow/python/kernel_tests/svd_op_test.py +++ b/tensorflow/python/kernel_tests/svd_op_test.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -46,16 +47,16 @@ def _AddTest(test_class, op_name, testcase_name, fn): class SvdOpTest(test.TestCase): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The input to svd should be a tensor of at least rank 2. scalar = constant_op.constant(1.) 
- with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 0"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*0"): linalg_ops.svd(scalar) vector = constant_op.constant([1., 2.]) - with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 1"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*1"): linalg_ops.svd(vector) @test_util.run_in_graph_and_eager_modes(use_gpu=True) From f1471bd25c127d5352eeb6c61c832c35b129e280 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 15 May 2020 15:03:35 -0700 Subject: [PATCH 304/412] Include what you use for the micro_framework bazel target. PiperOrigin-RevId: 311808278 Change-Id: I2869b7c191c71461d44edc77dc1cd3999c6376d9 --- tensorflow/lite/kernels/op_macros.h | 1 + tensorflow/lite/micro/BUILD | 2 ++ tensorflow/lite/micro/memory_helpers.cc | 4 ++++ tensorflow/lite/micro/memory_helpers.h | 3 +++ tensorflow/lite/micro/micro_allocator.cc | 2 ++ tensorflow/lite/micro/micro_allocator.h | 6 +++++- tensorflow/lite/micro/micro_error_reporter.cc | 3 +++ tensorflow/lite/micro/micro_error_reporter.h | 3 ++- tensorflow/lite/micro/micro_interpreter.cc | 10 +++++++--- tensorflow/lite/micro/micro_interpreter.h | 4 ++++ tensorflow/lite/micro/micro_mutable_op_resolver.h | 3 +++ tensorflow/lite/micro/micro_optional_debug_tools.cc | 9 +++++++++ tensorflow/lite/micro/simple_memory_allocator.cc | 1 + tensorflow/lite/micro/simple_memory_allocator.h | 2 +- tensorflow/lite/micro/test_helpers.cc | 7 ++++++- tensorflow/lite/micro/test_helpers.h | 4 +++- 16 files changed, 56 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/kernels/op_macros.h b/tensorflow/lite/kernels/op_macros.h index 33d033b10b6..8c1a6b1be16 100644 --- a/tensorflow/lite/kernels/op_macros.h +++ b/tensorflow/lite/kernels/op_macros.h @@ -19,6 +19,7 @@ limitations under the License. // non-portable function. #ifdef TF_LITE_MCU_DEBUG_LOG +#include "tensorflow/lite/micro/debug_log.h" #include "tensorflow/lite/micro/micro_error_reporter.h" #define DEBUG_LOG(x) \ diff --git a/tensorflow/lite/micro/BUILD b/tensorflow/lite/micro/BUILD index 5742a383b0f..67471bc64a6 100644 --- a/tensorflow/lite/micro/BUILD +++ b/tensorflow/lite/micro/BUILD @@ -56,8 +56,10 @@ cc_library( "//tensorflow/lite/core/api", "//tensorflow/lite/kernels/internal:compatibility", "//tensorflow/lite/kernels/internal:tensor", + "//tensorflow/lite/micro/memory_planner", "//tensorflow/lite/micro/memory_planner:greedy_memory_planner", "//tensorflow/lite/schema:schema_fbs", + "@flatbuffers//:runtime_cc", ], ) diff --git a/tensorflow/lite/micro/memory_helpers.cc b/tensorflow/lite/micro/memory_helpers.cc index 302f160a235..c1b761bf088 100644 --- a/tensorflow/lite/micro/memory_helpers.cc +++ b/tensorflow/lite/micro/memory_helpers.cc @@ -15,8 +15,12 @@ limitations under the License. #include "tensorflow/lite/micro/memory_helpers.h" +#include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" namespace tflite { diff --git a/tensorflow/lite/micro/memory_helpers.h b/tensorflow/lite/micro/memory_helpers.h index ef8205c8038..f52da062271 100644 --- a/tensorflow/lite/micro/memory_helpers.h +++ b/tensorflow/lite/micro/memory_helpers.h @@ -15,6 +15,9 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_ #define TENSORFLOW_LITE_MICRO_MEMORY_HELPERS_H_ +#include +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index 54ce3383a08..1dd1fa4b63c 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/flatbuffer_conversions.h" @@ -26,6 +27,7 @@ limitations under the License. #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/memory_helpers.h" #include "tensorflow/lite/micro/memory_planner/greedy_memory_planner.h" +#include "tensorflow/lite/micro/memory_planner/memory_planner.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" namespace tflite { diff --git a/tensorflow/lite/micro/micro_allocator.h b/tensorflow/lite/micro/micro_allocator.h index 6a6e1e03e53..d05974f365a 100644 --- a/tensorflow/lite/micro/micro_allocator.h +++ b/tensorflow/lite/micro/micro_allocator.h @@ -15,9 +15,13 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_ALLOCATOR_H_ #define TENSORFLOW_LITE_MICRO_MICRO_ALLOCATOR_H_ +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/micro/simple_memory_allocator.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/micro_error_reporter.cc b/tensorflow/lite/micro/micro_error_reporter.cc index bea3dc8db4c..6d8361cd25a 100644 --- a/tensorflow/lite/micro/micro_error_reporter.cc +++ b/tensorflow/lite/micro/micro_error_reporter.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/lite/micro/micro_error_reporter.h" +#include + #ifndef TF_LITE_STRIP_ERROR_STRINGS +#include "tensorflow/lite/micro/debug_log.h" #include "tensorflow/lite/micro/micro_string.h" #endif diff --git a/tensorflow/lite/micro/micro_error_reporter.h b/tensorflow/lite/micro/micro_error_reporter.h index b18c47f4ecb..e2c073a465d 100644 --- a/tensorflow/lite/micro/micro_error_reporter.h +++ b/tensorflow/lite/micro/micro_error_reporter.h @@ -15,9 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_ #define TENSORFLOW_LITE_MICRO_MICRO_ERROR_REPORTER_H_ +#include + #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/micro/compatibility.h" -#include "tensorflow/lite/micro/debug_log.h" namespace tflite { diff --git a/tensorflow/lite/micro/micro_interpreter.cc b/tensorflow/lite/micro/micro_interpreter.cc index 2d774d0a139..b46f9ecb9ea 100644 --- a/tensorflow/lite/micro/micro_interpreter.cc +++ b/tensorflow/lite/micro/micro_interpreter.cc @@ -14,12 +14,16 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/micro/micro_interpreter.h" +#include +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/flatbuffer_conversions.h" +#include "tensorflow/lite/core/api/error_reporter.h" +#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/core/api/tensor_utils.h" -#include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/micro/micro_allocator.h" -#include "tensorflow/lite/micro/micro_optional_debug_tools.h" namespace tflite { namespace { diff --git a/tensorflow/lite/micro/micro_interpreter.h b/tensorflow/lite/micro/micro_interpreter.h index 15f53b681a6..180a557668e 100644 --- a/tensorflow/lite/micro/micro_interpreter.h +++ b/tensorflow/lite/micro/micro_interpreter.h @@ -15,6 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_ #define TENSORFLOW_LITE_MICRO_MICRO_INTERPRETER_H_ +#include +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/op_resolver.h" diff --git a/tensorflow/lite/micro/micro_mutable_op_resolver.h b/tensorflow/lite/micro/micro_mutable_op_resolver.h index ead9be490a3..6c3e9a3331e 100644 --- a/tensorflow/lite/micro/micro_mutable_op_resolver.h +++ b/tensorflow/lite/micro/micro_mutable_op_resolver.h @@ -15,7 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_ #define TENSORFLOW_LITE_MICRO_MICRO_MUTABLE_OP_RESOLVER_H_ +#include + #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/micro/compatibility.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/micro_optional_debug_tools.cc b/tensorflow/lite/micro/micro_optional_debug_tools.cc index 42c42aea9f8..daa5d007cdf 100644 --- a/tensorflow/lite/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/micro/micro_optional_debug_tools.cc @@ -20,8 +20,17 @@ limitations under the License. #endif #include +#include +#include +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/micro/micro_allocator.h" +#include "tensorflow/lite/micro/micro_interpreter.h" #include "tensorflow/lite/schema/schema_generated.h" + namespace tflite { namespace { diff --git a/tensorflow/lite/micro/simple_memory_allocator.cc b/tensorflow/lite/micro/simple_memory_allocator.cc index be7c469529e..911e1e404f7 100644 --- a/tensorflow/lite/micro/simple_memory_allocator.cc +++ b/tensorflow/lite/micro/simple_memory_allocator.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/micro/memory_helpers.h" namespace tflite { diff --git a/tensorflow/lite/micro/simple_memory_allocator.h b/tensorflow/lite/micro/simple_memory_allocator.h index ed73104a2c6..223ef8398a4 100644 --- a/tensorflow/lite/micro/simple_memory_allocator.h +++ b/tensorflow/lite/micro/simple_memory_allocator.h @@ -16,9 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_LITE_MICRO_SIMPLE_MEMORY_ALLOCATOR_H_ #define TENSORFLOW_LITE_MICRO_SIMPLE_MEMORY_ALLOCATOR_H_ +#include #include -#include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" namespace tflite { diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index 77a1cc82f3b..c2607cd32c6 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -15,10 +15,15 @@ limitations under the License. #include "tensorflow/lite/micro/test_helpers.h" +#include +#include +#include #include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/tensor_utils.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/lite/micro/test_helpers.h b/tensorflow/lite/micro/test_helpers.h index 010e1f9e336..2d1d2895db0 100644 --- a/tensorflow/lite/micro/test_helpers.h +++ b/tensorflow/lite/micro/test_helpers.h @@ -18,8 +18,10 @@ limitations under the License. // Useful functions for writing tests. +#include + +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/common.h" -#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/micro/micro_utils.h" #include "tensorflow/lite/schema/schema_generated.h" From 8f1e715482accc94f9859954ed8b334c88c2b0cb Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Fri, 15 May 2020 15:04:55 -0700 Subject: [PATCH 305/412] Remove debug message PiperOrigin-RevId: 311808620 Change-Id: I3c1ded522e5e2a9487ee9b1c2307d5e72820c9e6 --- tensorflow/compiler/mlir/tensorflow/translate/import_model.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index a613ce1f920..37bbbbe5ee4 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -978,7 +978,6 @@ StatusOr ImporterBase::InferOutputType(const Node& node, int idx, if (dtype == DT_RESOURCE) { const AttrValue* dtype_attr = node.attrs().Find("_handle_dtypes"); const AttrValue* shape_attr = node.attrs().Find("_handle_shapes"); - LOG(INFO) << dtype_attr << " " << shape_attr; if (dtype_attr && shape_attr) { if (dtype_attr->list().type().empty()) { return errors::InvalidArgument( From 40e0712354815c9b4fd695e3ff7c231e55abb64c Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 15 May 2020 15:10:59 -0700 Subject: [PATCH 306/412] [tf.lite] Avoid designated initializers PiperOrigin-RevId: 311809643 Change-Id: I35af646b5e84d9ae7b25aa3cd52ae6b2eb5f0298 --- tensorflow/lite/delegates/nnapi/nnapi_delegate.cc | 7 ++++--- tensorflow/lite/delegates/nnapi/nnapi_delegate.h | 7 ++++--- tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc | 5 ++++- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index 39ab19aed2d..b3967800b44 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -4019,6 +4019,8 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( using 
::tflite::delegate::nnapi::NNAPIDelegateKernel; +StatefulNnApiDelegate::Data::Data(const NnApi* nnapi) : nnapi(nnapi) {} + StatefulNnApiDelegate::Data::~Data() { std::for_each(std::begin(delegate_state_cache), std::end(delegate_state_cache), @@ -4056,9 +4058,7 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(Options options) StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, Options options) - : TfLiteDelegate(TfLiteDelegateCreate()), - delegate_data_(Data{.execution_preference = options.execution_preference, - .nnapi = nnapi}) { + : TfLiteDelegate(TfLiteDelegateCreate()), delegate_data_(nnapi) { if (options.accelerator_name) { delegate_data_.accelerator_name = options.accelerator_name; } @@ -4068,6 +4068,7 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(const NnApi* nnapi, if (options.model_token) { delegate_data_.model_token = options.model_token; } + delegate_data_.execution_preference = options.execution_preference; delegate_data_.disallow_nnapi_cpu = options.disallow_nnapi_cpu; delegate_data_.max_number_delegated_partitions = options.max_number_delegated_partitions; diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h index 68c55e1aef4..7ef02bc5107 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.h @@ -181,8 +181,6 @@ class StatefulNnApiDelegate : public TfLiteDelegate { private: // Encapsulates all delegate data. struct Data { - // Preferred Power/perf trade-off. - Options::ExecutionPreference execution_preference; // Pointer to NNAPI implementation to be used by this delegate as // set when building the StatefulNnApiDelegate instance. // Will generally be the NnApiInstance() singleton but can be overridden @@ -190,6 +188,8 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // The ownership of the nnapi instance is left to the caller of // the StatefulNnApiDelegate constructor. const NnApi* nnapi; + // Preferred Power/perf trade-off. + Options::ExecutionPreference execution_preference; // Selected NNAPI accelerator name. std::string accelerator_name; // The cache dir for NNAPI model. @@ -202,7 +202,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate { std::vector tensor_memory_map; // Contains a non zero value if any NNAPI method call // operation returned a non zero result code. - int nnapi_errno; + int nnapi_errno = ANEURALNETWORKS_NO_ERROR; // Cache of kernels already built in StatefulNnApiDelegate::DoPrepare // when trying to understand if all nodes are supported by the target // accelerators. @@ -226,6 +226,7 @@ class StatefulNnApiDelegate : public TfLiteDelegate { // the execution uint64_t max_execution_loop_timeout_duration_ns = 0; + explicit Data(const NnApi* nnapi); ~Data(); // Caches an initialised NNAPIDelegateKernel. 
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc index 3c23054ea25..2bc7ae58449 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_disabled.cc @@ -27,7 +27,8 @@ StatefulNnApiDelegate::StatefulNnApiDelegate(Options /* options */) : StatefulNnApiDelegate() {} StatefulNnApiDelegate::StatefulNnApiDelegate() - : TfLiteDelegate(TfLiteDelegateCreate()) { + : TfLiteDelegate(TfLiteDelegateCreate()), + delegate_data_(/*nnapi=*/nullptr) { Prepare = DoPrepare; } @@ -46,6 +47,8 @@ int StatefulNnApiDelegate::GetNnApiErrno() const { return 0; } using ::tflite::delegate::nnapi::NNAPIDelegateKernel; +StatefulNnApiDelegate::Data::Data(const NnApi* nnapi) : nnapi(nnapi) {} + StatefulNnApiDelegate::Data::~Data() {} void StatefulNnApiDelegate::Data::CacheDelegateKernel( From f7d5cb929b63ac6717f294f710cd235b5ec4ef75 Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Fri, 15 May 2020 15:25:26 -0700 Subject: [PATCH 307/412] Print message about dumped MLIR modules PiperOrigin-RevId: 311811882 Change-Id: I6c85e75c87d3ca413631927d11aff61f7ed9b39f --- tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc index 06805e633e2..d7b511094d3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc @@ -38,6 +38,7 @@ inline static void Log(BridgeLoggerConfig::PrintCallbackFn print_callback, std::unique_ptr os; std::string filepath; if (CreateFileForDumping(name, &os, &filepath).ok()) print_callback(*os); + VLOG(1) << "Dumped MLIR module to " << filepath; } void BridgeLoggerConfig::printBeforeIfEnabled(mlir::Pass* pass, From b6284742e41f0ce702e5a5bdefb18795f559568e Mon Sep 17 00:00:00 2001 From: Anna R Date: Fri, 15 May 2020 15:40:20 -0700 Subject: [PATCH 308/412] Converting some of the dependencies in tensorflow/c/BUILD to use portable_tensorflow_lib_lite_no_runtime. 
PiperOrigin-RevId: 311814443 Change-Id: I42e75403c81babba32d4b9bb99ab4eed21e6ba44 --- tensorflow/c/BUILD | 10 ++++++++-- tensorflow/core/platform/default/BUILD | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 05d5f9a3ed2..e2781afc3e5 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -394,8 +394,14 @@ tf_cuda_library( deps = [ ":tf_status", ":tf_status_internal", - "//tensorflow/core:lib", - ], + ] + select({ + "//tensorflow:android": [ + "//tensorflow/core:portable_tensorflow_lib_lite", # TODO(annarev): exclude runtime srcs + ], + "//conditions:default": [ + "//tensorflow/core:lib", + ], + }), ) tf_cc_test( diff --git a/tensorflow/core/platform/default/BUILD b/tensorflow/core/platform/default/BUILD index 49318fd0811..89231b0f206 100644 --- a/tensorflow/core/platform/default/BUILD +++ b/tensorflow/core/platform/default/BUILD @@ -509,6 +509,7 @@ filegroup( filegroup( name = "mobile_srcs_no_runtime", srcs = [ + "casts.h", "context.h", "dynamic_annotations.h", "env.cc", From ec52e0fcd3107c060ee116781c73e1cad4d19219 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Fri, 15 May 2020 15:52:55 -0700 Subject: [PATCH 309/412] added missing CHECK, whitespace cleanup PiperOrigin-RevId: 311816309 Change-Id: I4181bf3d82e82f4ee60f2a894f487c19522490d1 --- tensorflow/python/tf_program/tests/mlir_gen_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/tf_program/tests/mlir_gen_test.py b/tensorflow/python/tf_program/tests/mlir_gen_test.py index 5e1ca5b36e0..49737352d73 100644 --- a/tensorflow/python/tf_program/tests/mlir_gen_test.py +++ b/tensorflow/python/tf_program/tests/mlir_gen_test.py @@ -83,7 +83,7 @@ class MLIRGenTest(MLIRGenTestBase): CHECK-LABEL: func @test_fn(%arg0: i1, %arg1: i1) -> i1 CHECK: %[[r0:[0-9]+]] = "tfp.And"(%arg0, %arg0, %arg1) : (i1, i1, i1) -> tensor<*xi1> CHECK: %[[r1:[0-9]+]] = "tfp.Or"(%arg0, %arg1, %[[r0]]) : (i1, i1, tensor<*xi1>) -> tensor<*xi1> - return %[[r1]] : tensor<*xi1> + CHECK: return %[[r1]] : tensor<*xi1> """ self._check_code(mlir_code, exp_mlir_code) @@ -158,7 +158,7 @@ class MLIRGenTest(MLIRGenTestBase): mlir_code = mlir_gen(test_fn) exp_mlir_code = r""" CHECK-LABEL: func @test_fn(%arg0: tensor<*xi32>) -> i32 - + CHECK: %[[r1:[0-9]+]] = "tf.Greater"(%arg0, %{{[0-9]+}}) : (tensor<*xi32>, tensor) -> tensor<*xi1> CHECK-NEXT: %[[r2:[0-9]+]] = "tfp.If"(%[[r1]]) ( { CHECK: return %{{[0-9]+}} : tensor @@ -222,7 +222,7 @@ class MLIRGenTest(MLIRGenTestBase): CHECK: %[[r5:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> CHECK: %[[r7:[0-9]+]] = "tf.Equal"(%arg0, %{{[0-9]+}}) {incompatible_shape_error = true} : (tensor<*xi32>, tensor) -> tensor<*xi1> CHECK: %[[r8:[0-9]+]] = "tfp.Or"(%[[r5]], %[[r7]]) : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> - + CHECK: %[[r9:[0-9]+]]:4 = "tfp.If"(%[[r8]]) ( { CHECK-NEXT: return %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32>, tensor<{{(\*x)?}}i32> CHECK-NEXT: }, { From da27ac6878d739eed3d3bebdaf9be260c47df14a Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Fri, 15 May 2020 15:54:21 -0700 Subject: [PATCH 310/412] Support int16 quantization type This patch is just changing a hard-coded 8 bits setting to be configured by the inference type. 
PiperOrigin-RevId: 311816528 Change-Id: I8da61fb0751122e29134d13e5f8200c89980e131 --- .../compiler/mlir/lite/transforms/prepare_quantize.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index a9e10a485bf..87cae3dd957 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -70,6 +70,7 @@ class PrepareQuantizePass : public PassWrapper { public: // Constructor used by the PassRegistration and enforce uint8 quantization. + // This is only used by test. explicit PrepareQuantizePass() { if (quantize_signed) quant_specs_.inference_type = tensorflow::DT_QINT8; @@ -257,15 +258,16 @@ void PrepareQuantizePass::runOnFunction() { // convert all of them to signed. OwningRewritePatternList patterns; bool is_signed = quant_specs_.IsSignedInferenceType(); + int bit_width = quant_specs_.GetQuantizationTypeWidth(); if (is_signed) { patterns.insert>(ctx); // Convert quant stats to int8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, true, ctx); + patterns.insert(bit_width, false, true, ctx); } else { // Convert quant stats to uint8 quantization parameters. // Currently, only activation stats are imported, so narrow_range = false. - patterns.insert(8, false, false, ctx); + patterns.insert(bit_width, false, false, ctx); } applyPatternsAndFoldGreedily(func, patterns); From 55b36215c8b8b2223b079522938bbdde695bcaf5 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 15 May 2020 15:59:38 -0700 Subject: [PATCH 311/412] include-what-you-use for lite/core/api:api PiperOrigin-RevId: 311817318 Change-Id: If003542599d8901465dcafc605fd4df1d0c65add --- tensorflow/lite/core/api/BUILD | 1 + tensorflow/lite/core/api/flatbuffer_conversions.cc | 6 +++++- tensorflow/lite/core/api/flatbuffer_conversions.h | 5 ++++- tensorflow/lite/core/api/op_resolver.cc | 4 ++++ tensorflow/lite/core/api/tensor_utils.cc | 2 ++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/core/api/BUILD b/tensorflow/lite/core/api/BUILD index 6681a3ed550..419a3b2486d 100644 --- a/tensorflow/lite/core/api/BUILD +++ b/tensorflow/lite/core/api/BUILD @@ -26,6 +26,7 @@ cc_library( deps = [ "//tensorflow/lite/c:common", "//tensorflow/lite/schema:schema_fbs", + "@flatbuffers//:runtime_cc", ], ) diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc index 63e04899ca3..c52fc9f690b 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.cc +++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc @@ -15,10 +15,14 @@ limitations under the License. #include "tensorflow/lite/core/api/flatbuffer_conversions.h" -#include +#include +#include +#include +#include "flatbuffers/flatbuffers.h" // from @flatbuffers #include "tensorflow/lite/c/builtin_op_data.h" #include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.h b/tensorflow/lite/core/api/flatbuffer_conversions.h index d774afe8e85..2feddfaa8e6 100644 --- a/tensorflow/lite/core/api/flatbuffer_conversions.h +++ b/tensorflow/lite/core/api/flatbuffer_conversions.h @@ -19,9 +19,12 @@ limitations under the License. 
// flatbuffer serialization format into in-memory values that are used by the // runtime API and interpreter. +#include +#include +#include + #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/core/api/op_resolver.h" #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { diff --git a/tensorflow/lite/core/api/op_resolver.cc b/tensorflow/lite/core/api/op_resolver.cc index 6424071f371..c239d9ed23e 100644 --- a/tensorflow/lite/core/api/op_resolver.cc +++ b/tensorflow/lite/core/api/op_resolver.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/lite/core/api/op_resolver.h" +#include "flatbuffers/flatbuffers.h" // from @flatbuffers +#include "tensorflow/lite/c/common.h" +#include "tensorflow/lite/core/api/error_reporter.h" + namespace tflite { TfLiteStatus GetRegistrationFromOpCode( diff --git a/tensorflow/lite/core/api/tensor_utils.cc b/tensorflow/lite/core/api/tensor_utils.cc index d8d6fc46a18..3aac16b6878 100644 --- a/tensorflow/lite/core/api/tensor_utils.cc +++ b/tensorflow/lite/core/api/tensor_utils.cc @@ -17,6 +17,8 @@ limitations under the License. #include +#include "tensorflow/lite/c/common.h" + namespace tflite { TfLiteStatus ResetVariableTensor(TfLiteTensor* tensor) { From eb07fd848a2f0cf45623799f43f372d07ae9a59b Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Fri, 15 May 2020 16:02:58 -0700 Subject: [PATCH 312/412] Add a TFE_Py_Execute traceme, which is the entrance to TF c++ PiperOrigin-RevId: 311817887 Change-Id: If924b3f3273096c961e6cc24459a620ce3889963 --- tensorflow/python/eager/pywrap_tfe_src.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc index 2d96ed57246..639f623bd1a 100644 --- a/tensorflow/python/eager/pywrap_tfe_src.cc +++ b/tensorflow/python/eager/pywrap_tfe_src.cc @@ -852,6 +852,8 @@ void TFE_Py_ExecuteCancelable(TFE_Context* ctx, const char* device_name, TFE_CancellationManager* cancellation_manager, TFE_OutputTensorHandles* outputs, TF_Status* out_status) { + tensorflow::profiler::TraceMe activity( + "TFE_Py_ExecuteCancelable", tensorflow::profiler::TraceMeLevel::kInfo); TFE_Op* op = GetOp(ctx, op_name, device_name, out_status); auto cleaner = tensorflow::gtl::MakeCleanup([ctx, op] { ReturnOp(ctx, op); }); if (!out_status->status.ok()) return; From 763710df31acf4b5da8f3c27f1bf0dd0ebb50c91 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 16:05:39 -0700 Subject: [PATCH 313/412] Update (non-gradient) tests for tf.linalg.qr to also run in eager mode. 
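As a usage illustration (a minimal sketch under TF 2.x eager execution, not part of this change), the op under test can now be exercised directly without a session:

  # Illustrative sketch: checks the reconstruction A ~= Q @ R in eager mode.
  import numpy as np
  import tensorflow as tf

  a = tf.constant(np.random.uniform(-1.0, 1.0, size=(4, 3)).astype(np.float32))
  q, r = tf.linalg.qr(a, full_matrices=False)
  np.testing.assert_allclose(tf.matmul(q, r).numpy(), a.numpy(), atol=1e-5)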
PiperOrigin-RevId: 311818375 Change-Id: I70d721522f4060f1a4997c271837fbd6f3629e9f --- tensorflow/python/kernel_tests/qr_op_test.py | 61 +++++++++++--------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py index 4e0af934053..b1bbd0aaee3 100644 --- a/tensorflow/python/kernel_tests/qr_op_test.py +++ b/tensorflow/python/kernel_tests/qr_op_test.py @@ -20,9 +20,10 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -30,7 +31,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -45,35 +46,37 @@ def _AddTest(test_class, op_name, testcase_name, fn): class QrOpTest(test.TestCase): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): - # The input to qr should be a tensor of at least rank 2. + # The input to svd should be a tensor of at least rank 2. scalar = constant_op.constant(1.) - with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 0"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*0"): linalg_ops.qr(scalar) vector = constant_op.constant([1., 2.]) - with self.assertRaisesRegexp(ValueError, - "Shape must be at least rank 2 but is rank 1"): + with self.assertRaisesRegexp((ValueError, errors_impl.InvalidArgumentError), + "rank.* 2.*1"): linalg_ops.qr(vector) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - all_ops = [] - for full_matrices_ in True, False: - for rows_ in 4, 5: - for cols_ in 4, 5: - matrix1 = random_ops.random_normal([rows_, cols_], seed=42) - matrix2 = random_ops.random_normal([rows_, cols_], seed=42) - q1, r1 = linalg_ops.qr(matrix1, full_matrices=full_matrices_) - q2, r2 = linalg_ops.qr(matrix2, full_matrices=full_matrices_) - all_ops += [q1, r1, q2, r2] - val = self.evaluate(all_ops) - for i in range(8): - q = 4 * i - self.assertAllClose(val[q], val[q + 2]) # q1 == q2 - self.assertAllClose(val[q + 1], val[q + 3]) # r1 == r2 + seed = [42, 24] + all_ops = [] + for full_matrices_ in True, False: + for rows_ in 4, 5: + for cols_ in 4, 5: + matrix_shape = [rows_, cols_] + matrix1 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed) + matrix2 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed) + self.assertAllEqual(matrix1, matrix2) + q1, r1 = linalg_ops.qr(matrix1, full_matrices=full_matrices_) + q2, r2 = linalg_ops.qr(matrix2, full_matrices=full_matrices_) + all_ops += [q1, q2, r1, r2] + val = self.evaluate(all_ops) + for i in range(0, len(val), 2): + self.assertAllClose(val[i], val[i + 1]) def _GetQrOpTest(dtype_, 
shape_, full_matrices_, use_static_shape_): @@ -121,8 +124,10 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_): tol = 1e-14 self.assertAllClose(identity, xx, atol=tol) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def Test(self): + if not use_static_shape_ and context.executing_eagerly(): + return np.random.seed(1) x_np = np.random.uniform( low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) @@ -131,7 +136,6 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_): low=-1.0, high=1.0, size=np.prod(shape_)).reshape(shape_).astype(dtype_) - with self.session(use_gpu=True) as sess: if use_static_shape_: x_tf = constant_op.constant(x_np) else: @@ -141,7 +145,8 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_): if use_static_shape_: q_tf_val, r_tf_val = self.evaluate([q_tf, r_tf]) else: - q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np}) + with self.session(use_gpu=True) as sess: + q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np}) q_dims = q_tf_val.shape np_q = np.ndarray(q_dims, dtype_) @@ -266,7 +271,7 @@ if __name__ == "__main__": for full_matrices in False, True: for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10): # TF2 does not support placeholders under eager so we skip it - for use_static_shape in set([True, tf2.enabled()]): + for use_static_shape in [True, False]: shape = batch_dims + (rows, cols) name = "%s_%s_full_%s_static_%s" % (dtype.__name__, "_".join(map(str, shape)), From a133be3d31f215d669cfbfdc7df4f28edc99c50a Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 15 May 2020 16:08:10 -0700 Subject: [PATCH 314/412] Add TraceMeEncode helper PiperOrigin-RevId: 311818799 Change-Id: I437235c1603a1e5be99a410376801771cfda0c66 --- tensorflow/core/profiler/lib/BUILD | 10 +++ tensorflow/core/profiler/lib/traceme_encode.h | 82 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 tensorflow/core/profiler/lib/traceme_encode.h diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 6316fd118fc..0aa1a5d6b67 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -102,6 +102,16 @@ cc_library( ]), ) +cc_library( + name = "traceme_encode", + hdrs = ["traceme_encode.h"], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/core:lib", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "annotated_traceme", hdrs = ["annotated_traceme.h"], diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h new file mode 100644 index 00000000000..772f56a2153 --- /dev/null +++ b/tensorflow/core/profiler/lib/traceme_encode.h @@ -0,0 +1,82 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ +#define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ + +#include + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" + +namespace tensorflow { +namespace profiler { +namespace internal { + +// Copies the contents of str to the address pointed by out. +// Returns the address after the copy. +// REQUIRED: The address range [out, out + str.size()] must have been allocated. +TF_ATTRIBUTE_ALWAYS_INLINE inline char* Append(char* out, + absl::string_view str) { + const size_t str_size = str.size(); + if (str_size > 0) { + memcpy(out, str.data(), str_size); + out += str_size; + } + return out; +} + +} // namespace internal + +// Encodes an event name and arguments into a string stored by TraceMe. +// Use within a lambda to avoid expensive operations when tracing is inactive. +// Example Usage: +// TraceMe trace_me([&name, value1]() { +// return TraceMeEncode(name, {{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::string name, + std::initializer_list> args) { + if (TF_PREDICT_TRUE(args.size() > 0)) { + const auto old_size = name.size(); + auto new_size = old_size + args.size() * 2 + 1; + for (const auto& arg : args) { + new_size += arg.first.size() + arg.second.size(); + } + name.resize(new_size); + char* const begin = &name[0]; + char* out = begin + old_size; + *out++ = '#'; + for (const auto& arg : args) { + out = internal::Append(out, arg.first); + *out++ = '='; + out = internal::Append(out, arg.second.Piece()); + *out++ = ','; + } + *(out - 1) = '#'; + DCHECK_EQ(out, begin + new_size); + } + return name; +} + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_LIB_TRACEME_ENCODE_H_ From 22608ca0c293ec1a2976dcfdc6f02d5ce2173cb5 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 15 May 2020 16:13:31 -0700 Subject: [PATCH 315/412] Update TPU rewrite pass to populate replicated host devices on tf_device.replicate. Replicated host devices under data parallelism may be necessary if outside compilation is present. PiperOrigin-RevId: 311819706 Change-Id: Iad2775559374d481e3b39ba1a8681f660ee6787e --- .../mlir/tensorflow/tests/tpu_rewrite.mlir | 29 ++++++++++--------- .../tensorflow/transforms/tpu_rewrite_pass.cc | 12 ++++++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index b8a48bbb379..332b46f427f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -747,7 +747,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests simple case of `tf_device.cluster_func` on TPU with replication. +// Tests simple case of `tf_device.cluster_func` on TPU with replication. 
Under +// data parallelism replicated host devices are also added to the +// tf_device.replicate module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { // CHECK-LABEL: func @replicated_tpu_cluster_func @@ -758,7 +760,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[REPLICATE:[0-9]*]]:2 = tf_device.replicate // CHECK-SAME: ([%[[A_OUTPUT]], %[[ARG_0]]] as %[[RI_0:[a-z0-9]*]]: tensor) - // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"], TPU_REPLICATED_HOST = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:CPU:0"]} // CHECK-SAME: n = 2 %1:2 = tf_device.replicate([%0, %arg0] as %ri_0: tensor) {n = 2 : i32} { // CHECK: %[[A_SHAPE_OUTPUT:[0-9]*]] = "tf.Shape"(%[[RI_0]]) @@ -1222,7 +1224,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // ----- -// Tests simple case of `tf_device.cluster_func` on TPU with replication and parallel_execute. +// Tests simple case of `tf_device.cluster_func` on TPU with replication and +// parallel_execute. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:worker/replica:0/task:0/device:CPU:0", "/job:worker/replica:0/task:0/device:TPU_SYSTEM:0", "/job:worker/replica:0/task:0/device:TPU:0", "/job:worker/replica:0/task:0/device:TPU:1"]} { // CHECK-LABEL: func @replicated_parallel_tpu_cluster_func @@ -1240,7 +1243,6 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor tf_device.return }, { %4 = "tf_device.cluster_func"(%ri_0) {_tpu_replicate = "cluster0", func = @tpu0_func, num_cores_per_replica = 1, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", padding_map = ["\08\01\10\02\18\03"], topology = "", device_assignment = [], input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"], output_sharding_configuration = ["\08\01\1A\01\01\22\01\00"]} : (tensor) -> tensor - tf_device.return %4 : tensor }) : () -> (tensor) tf_device.return %3 : tensor @@ -1317,15 +1319,14 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // "\0A\04\01\02\01\02\10\02\18\02\22\10\00\00\00\00\00\00\00\01\00\01\00\00\00\01\00\01" // ----- -// Tests devices are set properly for replicated model parallelism. +// Tests devices are set properly for replicated model parallelism. No +// replicated host device should be present. 
module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @replicated_parallel_execute func @replicated_parallel_execute(%arg0: tensor<8xi32>, %arg1: tensor<8xi32>) -> (tensor<8xi32>, tensor<8xi32>) { // CHECK: tf_device.replicate - // CHECK-SAME: devices = - // CHECK-SAME: TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"] - // CHECK-SAME: TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"] + // CHECK-SAME: devices = {TPU_REPLICATED_CORE_0 = ["/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1"], TPU_REPLICATED_CORE_1 = ["/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU:0"]} %0:2 = tf_device.replicate([%arg0, %arg1] as %ri: tensor<8xi32>) {n = 2 : i32} { // CHECK-NEXT: %[[COMPILE:[a-z0-9]+]]:3 = "tf_device.launch" // CHECK-NEXT: "tf._TPUCompileMlir"() @@ -1357,8 +1358,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests that inputs are inputs with maximal and replicate sharding are set properly -// for replicated model parallelism. +// Tests that inputs are inputs with maximal and replicate sharding are set +// properly for replicated model parallelism. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_input_with_sharding_configurations @@ -1392,8 +1393,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests devices are set properly for replicated model parallelism with -// outputs to TPU computation placed on logical device 0. +// Tests devices are set properly for replicated model parallelism with outputs +// to TPU computation placed on logical device 0. module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:1/device:CPU:0", "/job:localhost/replica:0/task:1/device:TPU:0", "/job:localhost/replica:0/task:1/device:TPU:1", "/job:localhost/replica:0/task:1/device:TPU_SYSTEM:0"]} { // CHECK-LABEL: func @parallel_execute_with_different_outputs @@ -1469,8 +1470,8 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc // ----- -// Tests inputs are correctly split and fed into TPU computation for -// tiled input sharding. +// Tests inputs are correctly split and fed into TPU computation for tiled input +// sharding. 
// The following OpSharding is used for TPU computation inputs in below test: // Proto debug string: diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index 986736a9502..a7ad6a964b9 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -437,6 +437,18 @@ void AssignDevicesToReplicate( builder->getStrArrayAttr(devices_by_core))); } + // For data parallelism, also add replicated host devices, as these are + // necessary for outside compilation. + if (num_cores_per_replica == 1) { + llvm::SmallVector hosts; + hosts.reserve(num_replicas); + for (int replica = 0; replica < num_replicas; ++replica) + hosts.push_back(tpu_devices[replica][0].host); + + device_attrs.push_back(builder->getNamedAttr( + tensorflow::kTPUReplicatedHost, builder->getStrArrayAttr(hosts))); + } + replicate.setAttr(kDevicesAttr, builder->getDictionaryAttr(device_attrs)); } From a3746cc77a95db8acea7f7fbd5495fbdf0563139 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 15 May 2020 16:29:51 -0700 Subject: [PATCH 316/412] Remove LazyLoader dependency from lite.interpreter The original motivation for using this when loading the native lite.Interpreter deps appears to no longer hold. PiperOrigin-RevId: 311822195 Change-Id: I2a6877dcd65cdc906d025722714fe209c8673d5d --- tensorflow/lite/python/interpreter.py | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/tensorflow/lite/python/interpreter.py b/tensorflow/lite/python/interpreter.py index ccbba9014c8..04863b12853 100644 --- a/tensorflow/lite/python/interpreter.py +++ b/tensorflow/lite/python/interpreter.py @@ -27,20 +27,8 @@ import numpy as np # pylint: disable=g-import-not-at-top if not __file__.endswith('tflite_runtime/interpreter.py'): # This file is part of tensorflow package. - from tensorflow.python.util.lazy_loader import LazyLoader + from tensorflow.lite.python.interpreter_wrapper import _pywrap_tensorflow_interpreter_wrapper as _interpreter_wrapper from tensorflow.python.util.tf_export import tf_export as _tf_export - - # Lazy load since some of the performance benchmark skylark rules - # break dependencies. Must use double quotes to match code internal rewrite - # rule. - # pylint: disable=g-inconsistent-quotes - _interpreter_wrapper = LazyLoader( - "_interpreter_wrapper", globals(), - "tensorflow.lite.python.interpreter_wrapper." - '_pywrap_tensorflow_interpreter_wrapper') - # pylint: enable=g-inconsistent-quotes - - del LazyLoader else: # This file is part of tflite_runtime package. from tflite_runtime import _pywrap_tensorflow_interpreter_wrapper as _interpreter_wrapper From c26ac449e0c798e5527f565e95078e42c662952f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 17:18:30 -0700 Subject: [PATCH 317/412] Enable tf.linalg.matrix_solve tests in eager mode. 
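As a usage illustration (a minimal eager-mode sketch, not part of this change), tf.linalg.solve can be called and verified directly:

  # Illustrative sketch: solves A x = b eagerly and checks A @ x against b.
  import numpy as np
  import tensorflow as tf

  a = tf.constant([[3.0, 1.0], [1.0, 2.0]])
  b = tf.constant([[9.0], [8.0]])
  x = tf.linalg.solve(a, b)
  np.testing.assert_allclose(tf.matmul(a, x).numpy(), b.numpy(), atol=1e-5)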
PiperOrigin-RevId: 311829192 Change-Id: I8d8c0fb2e28c6dd497a99724d4e2bcd78f2d2ed6 --- .../kernel_tests/matrix_solve_op_test.py | 96 ++++++++++--------- 1 file changed, 50 insertions(+), 46 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_op_test.py index 0b6b403210c..bbd909c8e58 100644 --- a/tensorflow/python/kernel_tests/matrix_solve_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_solve_op_test.py @@ -21,14 +21,16 @@ from __future__ import print_function import numpy as np from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -56,19 +58,19 @@ class MatrixSolveOpTest(test.TestCase): a_np = np.tile(a_np, batch_dims + [1, 1]) b = np.tile(b, batch_dims + [1, 1]) np_ans = np.linalg.solve(a_np, b) - for use_placeholder in False, True: - with self.cached_session(use_gpu=True) as sess: - if use_placeholder: - a_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) - b_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) - tf_ans = linalg_ops.matrix_solve(a_ph, b_ph, adjoint=adjoint) + for use_placeholder in set((False, not context.executing_eagerly())): + if use_placeholder: + a_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) + b_ph = array_ops.placeholder(dtypes.as_dtype(np_type)) + tf_ans = linalg_ops.matrix_solve(a_ph, b_ph, adjoint=adjoint) + with self.cached_session(use_gpu=True) as sess: out = sess.run(tf_ans, {a_ph: a, b_ph: b}) - else: - tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint) - out = self.evaluate(tf_ans) - self.assertEqual(tf_ans.get_shape(), out.shape) - self.assertEqual(np_ans.shape, out.shape) - self.assertAllClose(np_ans, out, atol=tol, rtol=tol) + else: + tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint) + out = self.evaluate(tf_ans) + self.assertEqual(tf_ans.get_shape(), out.shape) + self.assertEqual(np_ans.shape, out.shape) + self.assertAllClose(np_ans, out, atol=tol, rtol=tol) def _generateMatrix(self, m, n): matrix = (np.random.normal(-5, 5, @@ -77,7 +79,7 @@ class MatrixSolveOpTest(test.TestCase): [m, n])) return matrix - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testSolve(self): for n in 1, 2, 4, 9: matrix = self._generateMatrix(n, n) @@ -85,7 +87,7 @@ class MatrixSolveOpTest(test.TestCase): rhs = self._generateMatrix(n, nrhs) self._verifySolve(matrix, rhs) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testSolveBatch(self): for n in 2, 5: matrix = self._generateMatrix(n, n) @@ -94,48 +96,50 @@ class MatrixSolveOpTest(test.TestCase): for batch_dims in [[2], [2, 2], [7, 4]]: self._verifySolve(matrix, rhs, batch_dims=batch_dims) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNonSquareMatrix(self): # When the solve of a non-square matrix is attempted we should return # an error - 
with self.session(use_gpu=True): - with self.assertRaises(ValueError): - matrix = constant_op.constant([[1., 2., 3.], [3., 4., 5.]]) - linalg_ops.matrix_solve(matrix, matrix) + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): + matrix = constant_op.constant([[1., 2., 3.], [3., 4., 5.]]) + self.evaluate(linalg_ops.matrix_solve(matrix, matrix)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The matrix and right-hand sides should have the same number of rows. - with self.session(use_gpu=True): - matrix = constant_op.constant([[1., 0.], [0., 1.]]) - rhs = constant_op.constant([[1., 0.]]) - with self.assertRaises(ValueError): - linalg_ops.matrix_solve(matrix, rhs) + matrix = constant_op.constant([[1., 0.], [0., 1.]]) + rhs = constant_op.constant([[1., 0.]]) + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): + self.evaluate(linalg_ops.matrix_solve(matrix, rhs)) def testNotInvertible(self): # The input should be invertible. - with self.session(use_gpu=True): - with self.assertRaisesOpError("Input matrix is not invertible."): - # All rows of the matrix below add to zero - matrix = constant_op.constant([[1., 0., -1.], [-1., 1., 0.], - [0., -1., 1.]]) - linalg_ops.matrix_solve(matrix, matrix).eval() + with self.assertRaisesOpError("Input matrix is not invertible."): + # All rows of the matrix below add to zero + matrix = constant_op.constant([[1., 0., -1.], [-1., 1., 0.], + [0., -1., 1.]]) + self.evaluate(linalg_ops.matrix_solve(matrix, matrix)) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrent(self): - with self.session(use_gpu=True) as sess: - all_ops = [] - for adjoint_ in False, True: - lhs1 = random_ops.random_normal([3, 3], seed=42) - lhs2 = random_ops.random_normal([3, 3], seed=42) - rhs1 = random_ops.random_normal([3, 3], seed=42) - rhs2 = random_ops.random_normal([3, 3], seed=42) - s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint_) - s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint_) - all_ops += [s1, s2] - val = self.evaluate(all_ops) - self.assertAllEqual(val[0], val[1]) - self.assertAllEqual(val[2], val[3]) + seed = [42, 24] + matrix_shape = [3, 3] + all_ops = [] + for adjoint_ in False, True: + lhs1 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + lhs2 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + rhs1 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + rhs2 = stateless_random_ops.stateless_random_normal( + matrix_shape, seed=seed) + s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint_) + s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint_) + all_ops += [s1, s2] + val = self.evaluate(all_ops) + for i in range(0, len(all_ops), 2): + self.assertAllEqual(val[i], val[i + 1]) class MatrixSolveBenchmark(test.Benchmark): From c1ac8f2b817ce772e0da53f017cd662143d8ec38 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 17:31:15 -0700 Subject: [PATCH 318/412] Enable tf.linalg.matrix_solve_ls tests in eager mode. 
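As a usage illustration (a minimal eager-mode sketch, not part of this change), tf.linalg.lstsq with no regularization should satisfy the normal equations, mirroring the residual check in the updated test:

  # Illustrative sketch: for l2_regularizer=0 the solution x satisfies
  # A^H (b - A x) ~= 0.
  import numpy as np
  import tensorflow as tf

  np.random.seed(0)
  a = tf.constant(np.random.uniform(-1.0, 1.0, size=(5, 3)).astype(np.float32))
  b = tf.constant(np.random.uniform(-1.0, 1.0, size=(5, 1)).astype(np.float32))
  x = tf.linalg.lstsq(a, b, fast=True)
  residual = tf.matmul(a, b - tf.matmul(a, x), adjoint_a=True)
  np.testing.assert_allclose(residual.numpy(), np.zeros((3, 1)), atol=1e-3)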
PiperOrigin-RevId: 311830778 Change-Id: I63aca8ab80b63201b3fe12e5e0af31f5760b3fad --- .../kernel_tests/matrix_solve_ls_op_test.py | 92 ++++++++++--------- 1 file changed, 50 insertions(+), 42 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py index b99c8f6d256..b7a159e2eff 100644 --- a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py @@ -20,10 +20,11 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 from tensorflow.python.client import session +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops @@ -89,6 +90,8 @@ class MatrixSolveLsOpTest(test_lib.TestCase): if not fast and l2_regularizer != 0: # The slow path does not support regularization. return + if use_placeholder and context.executing_eagerly(): + return maxdim = np.max(x.shape) if dtype == np.float32 or dtype == np.complex64: tol = maxdim * 5e-4 @@ -109,64 +112,70 @@ class MatrixSolveLsOpTest(test_lib.TestCase): b = np.tile(b, batch_shape + (1, 1)) np_ans = np.tile(np_ans, batch_shape + (1, 1)) np_r_norm = np.tile(np_r_norm, batch_shape) - with self.cached_session(use_gpu=fast) as sess: - if use_placeholder: - a_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) - b_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) - feed_dict = {a_ph: a, b_ph: b} - tf_ans = linalg_ops.matrix_solve_ls( - a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer) - else: - tf_ans = linalg_ops.matrix_solve_ls( - a, b, fast=fast, l2_regularizer=l2_regularizer) - feed_dict = {} - self.assertEqual(np_ans.shape, tf_ans.get_shape()) - if l2_regularizer == 0: - # The least squares solution should satisfy A^H * (b - A*x) = 0. - tf_r = b - math_ops.matmul(a, tf_ans) - tf_r = math_ops.matmul(a, tf_r, adjoint_a=True) - tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1]) - tf_ans_val, tf_r_norm_val = sess.run( - [tf_ans, tf_r_norm], feed_dict=feed_dict) - self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol) - else: + if use_placeholder: + a_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) + b_ph = array_ops.placeholder(dtypes.as_dtype(dtype)) + feed_dict = {a_ph: a, b_ph: b} + tf_ans = linalg_ops.matrix_solve_ls( + a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer) + else: + tf_ans = linalg_ops.matrix_solve_ls( + a, b, fast=fast, l2_regularizer=l2_regularizer) + feed_dict = None + self.assertEqual(np_ans.shape, tf_ans.get_shape()) + if feed_dict: + with self.session(use_gpu=True) as sess: tf_ans_val = sess.run(tf_ans, feed_dict=feed_dict) - + else: + tf_ans_val = self.evaluate(tf_ans) self.assertEqual(np_ans.shape, tf_ans_val.shape) self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol) - @test_util.run_v1_only("b/120545219") + if l2_regularizer == 0: + # The least squares solution should satisfy A^H * (b - A*x) = 0. 
+ tf_r = b - math_ops.matmul(a, tf_ans) + tf_r = math_ops.matmul(a, tf_r, adjoint_a=True) + tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1]) + if feed_dict: + with self.session(use_gpu=True) as sess: + tf_ans_val, tf_r_norm_val = sess.run([tf_ans, tf_r_norm], + feed_dict=feed_dict) + else: + tf_ans_val, tf_r_norm_val = self.evaluate([tf_ans, tf_r_norm]) + self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol) + + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The matrix and right-hand sides should have the same number of rows. with self.session(use_gpu=True): matrix = constant_op.constant([[1., 0.], [0., 1.]]) rhs = constant_op.constant([[1., 0.]]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.matrix_solve_ls(matrix, rhs) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): full = np.array([[1., 2.], [3., 4.], [5., 6.]]) empty0 = np.empty([3, 0]) empty1 = np.empty([0, 2]) for fast in [True, False]: - with self.cached_session(use_gpu=True): - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast)) - self.assertEqual(tf_ans.shape, (0, 0)) - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(empty0, full, fast=fast)) - self.assertEqual(tf_ans.shape, (0, 2)) - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(full, empty0, fast=fast)) - self.assertEqual(tf_ans.shape, (2, 0)) - tf_ans = self.evaluate( - linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast)) - self.assertEqual(tf_ans.shape, (2, 2)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast)) + self.assertEqual(tf_ans.shape, (0, 0)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(empty0, full, fast=fast)) + self.assertEqual(tf_ans.shape, (0, 2)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(full, empty0, fast=fast)) + self.assertEqual(tf_ans.shape, (2, 0)) + tf_ans = self.evaluate( + linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast)) + self.assertEqual(tf_ans.shape, (2, 2)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBatchResultSize(self): # 3x3x3 matrices, 3x3x1 right-hand sides. - matrix = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.] * 3).reshape(3, 3, 3) + matrix = np.array([1., 0., 0., 0., 1., 0., 0., 0., 1.] * 3).reshape(3, 3, 3) rhs = np.array([1., 2., 3.] 
* 3).reshape(3, 3, 1) answer = linalg_ops.matrix_solve(matrix, rhs) ls_answer = linalg_ops.matrix_solve_ls(matrix, rhs) @@ -358,8 +367,7 @@ if __name__ == "__main__": # ROCm does not support BLAS operations for complex types dtypes_to_test += [np.complex64, np.complex128] for dtype_ in dtypes_to_test: - # TF2 does not support placeholders under eager so we skip it - for use_placeholder_ in set([False, not tf2.enabled()]): + for use_placeholder_ in set([False, True]): for fast_ in [True, False]: l2_regularizers = [0] if dtype_ == np.complex128 else [0, 0.1] for l2_regularizer_ in l2_regularizers: From 96f1bbe90a58b94fe760ff748afa1aff20e16696 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Fri, 15 May 2020 18:04:14 -0700 Subject: [PATCH 319/412] [XLA:GPU] [NFC] Add more logging output to explain fusion decisions PiperOrigin-RevId: 311834483 Change-Id: I13a0c23f1da4f7080eff4852b0e470a9d86c26b5 --- tensorflow/compiler/xla/service/gpu/gpu_fusible.cc | 11 +++++++++++ .../compiler/xla/service/gpu/instruction_fusion.cc | 4 ++++ tensorflow/compiler/xla/service/instruction_fusion.cc | 6 +++++- 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc index 1316e8ad1aa..bb4184ff76f 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc @@ -351,6 +351,9 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, const HloInstruction& instr2) { if (SharedMemoryUsage(instr1) + SharedMemoryUsage(instr2) > kSharedMemoryBudgetInBytes) { + VLOG(5) << "Shared memory usage of fusion of " << instr1.ToString() + << " and " << instr2.ToString() << " would be over the budget of " + << kSharedMemoryBudgetInBytes << "B"; return true; } @@ -383,6 +386,14 @@ bool FusionWouldBeTooLarge(const HloInstruction& instr1, num_output_buffers <= kMaxOperandsAndOutputsPerFusion) { return false; + } else { + VLOG(5) << "Operand count of " + << "(" << instr1.ToString() << " ) = " << instr1.operand_count() + << " and ( " << instr2.ToString() + << " ) = " << instr2.operand_count() + << " and num_output_buffers = " << num_output_buffers + << " is bigger than the bound of " + << kMaxOperandsAndOutputsPerFusion; } // Compute the precise number of operands to the new fusion. diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index fc1c1bb4ab1..a0580e2ab04 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -65,12 +65,16 @@ bool GpuInstructionFusion::ShouldFuseInexpensiveChecks(HloInstruction* consumer, bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { if (!ShouldFuseInexpensiveChecks(consumer, operand_index)) { + VLOG(5) << "Not fusing inexpensive checks of operand " << operand_index + << " of " << consumer->ToString(); return false; } auto producer = consumer->operand(operand_index); // The following checks are potentially expensive. 
if (FusionWouldBeTooLarge(*consumer, *producer)) { + VLOG(5) << "Fusion of (" << producer->ToString() << ") into (" + << consumer->ToString() << ") would be too large"; return false; } if (consumer->opcode() != HloOpcode::kFusion) { diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index 1bc3d24274c..5de081c6343 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -502,7 +502,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { while (true) { auto next_entry = fusion_queue->DequeueNextInstructionAndOperandsToFuseInOrder(); - auto instruction = next_entry.first; + HloInstruction* instruction = next_entry.first; if (instruction == nullptr) { break; } @@ -512,12 +512,14 @@ StatusOr InstructionFusion::Run(HloModule* module) { continue; } + VLOG(5) << "Considering fusion of: " << instruction->ToString(); std::vector& sorted_operand_numbers = next_entry.second; for (int64 i : sorted_operand_numbers) { HloInstruction* operand = instruction->mutable_operand(i); if (!operand->IsFusible()) { + VLOG(3) << "Operand (" << operand->ToString() << ") is not fusible"; continue; } @@ -691,6 +693,8 @@ bool InstructionFusion::ShouldFuse(HloInstruction* consumer, if (FusionWouldDuplicate(*producer, *consumer) && (!may_duplicate_ || is_expensive_(*producer)) && !IsAlwaysDuplicable(*producer)) { + VLOG(4) << "Stopping: fusion may duplicate operand (" + << producer->ToString() << ") , and this is expensive"; return false; } From cbc4d5442e946306ef5f2ed88ec1ec3c4c9ec765 Mon Sep 17 00:00:00 2001 From: Mehmet Deveci Date: Fri, 15 May 2020 18:07:38 -0700 Subject: [PATCH 320/412] Adding an option to tensor tracer to create a suffix folder based on the fingerprint of the tf.graph. If use_fingerprint_subdirectory is provided, then the TensorTracer summaries will be written under /. If there are changes to the graph, the changes will be listed under different fingerprints. PiperOrigin-RevId: 311834837 Change-Id: I9dfbabfeb7fbe58a2a47c2581474ed86647781dc --- tensorflow/python/tpu/tensor_tracer.proto | 4 +++ tensorflow/python/tpu/tensor_tracer.py | 21 +++++++++++- tensorflow/python/tpu/tensor_tracer_flags.py | 4 ++- tensorflow/python/tpu/tensor_tracer_report.py | 34 +++++++++++++++++-- 4 files changed, 58 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/tpu/tensor_tracer.proto b/tensorflow/python/tpu/tensor_tracer.proto index ad5392d65fe..7b745f0f45b 100644 --- a/tensorflow/python/tpu/tensor_tracer.proto +++ b/tensorflow/python/tpu/tensor_tracer.proto @@ -21,6 +21,10 @@ message TensorTracerReport { // A map from tensor name to its TracedTensorDef. map tensordef = 3; + // The fingerprint of the TensorTracerReport (fingerprint calculation excludes + // this field and graphdef). + string fingerprint = 4; + message TensorTracerConfig { // Tensor tracer version, e.g. hostcall, outside compilation. 
string version = 1; diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py index bd96de42f3a..b4f99897094 100644 --- a/tensorflow/python/tpu/tensor_tracer.py +++ b/tensorflow/python/tpu/tensor_tracer.py @@ -100,7 +100,7 @@ _TT_TENSORBOARD_PLUGIN_NAME = 'tensor_tracer' _TT_HOSTCALL_KEY = 'tensor_tracer_host_call' _TT_EVENT_FILE_SUFFIX = '.tensor_tracer' -_TT_SUMMARY_MAX_QUEUE = 100 +_TT_SUMMARY_MAX_QUEUE = 10 def set_parameters(tensor_tracer_params=None): @@ -206,6 +206,9 @@ def set_parameters(tensor_tracer_params=None): -> op2 -> op1 -> op0, if op0 has a NaN and trace_stack_size is 1, the result of op1 will also be printed. trace_stack_size is 2, the result of op1 and op2 will be printed. + - use_fingerprint_subdirectory: The trace directory will be chosen as + using the fingerprint of the trace metadata under the provided + trace_dir. """ flags = '--%s=1' % tensor_tracer_flags.FLAG_NAME_ENABLE if tensor_tracer_params: @@ -547,6 +550,7 @@ class TensorTracer(object): self._traced_op_names = set() self._report_proto = None self._temp_cache_var = [] + self._report_proto_path = '' def report_proto(self): """Getter for tensor_tracer.proto object for summary and full_tensor_summary modes. @@ -564,6 +568,14 @@ class TensorTracer(object): 'Report proto only exists for ' 'trace_mode=[summary|full_tensor_summary]') + def report_proto_path(self): + """Getter for path where tensor_tracer.proto object should be written. + + Returns: + A string path. + """ + return self._report_proto_path + def _get_all_cache_variables(self): return self._cache_variables @@ -1366,6 +1378,13 @@ class TensorTracer(object): self._report_proto = report_handler.create_report_proto( self._tt_config, self._parameters, tensor_trace_order, tensor_trace_points, self._signature_types()) + if self._parameters.use_fingerprint_subdir: + self._parameters.trace_dir = os.path.join( + self._parameters.trace_dir, self._report_proto.fingerprint) + logging.info('TensorTracer updating trace_dir to %s', + self._parameters.trace_dir) + self._report_proto_path = tensor_tracer_report.report_proto_path( + self._parameters.trace_dir) if self._parameters.report_file_path != _SKIP_REPORT_FILE: report_handler.write_report_proto(self._report_proto, self._parameters) else: diff --git a/tensorflow/python/tpu/tensor_tracer_flags.py b/tensorflow/python/tpu/tensor_tracer_flags.py index c5e3e88597b..4e412c46e82 100644 --- a/tensorflow/python/tpu/tensor_tracer_flags.py +++ b/tensorflow/python/tpu/tensor_tracer_flags.py @@ -74,6 +74,7 @@ FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS = 'dump_graphs' FLAG_NAME_SUMMARY_SIGNATURES = 'signatures' FLAG_NAME_SUMMARY_PER_CORE = 'collect_summary_per_core' FLAG_NAME_TEMP_CACHE_VAR = 'use_temp_cache' +FLAG_NAME_FINGERPRINT_DIR = 'use_fingerprint_subdirectory' _OP_RANGE_PAT = re.compile(r'(\d+):(\d+)') _TEST_UNDECLARED_OUTPUTS_DIR_ENV_VAR = 'TEST_UNDECLARED_OUTPUTS_DIR' @@ -127,6 +128,7 @@ class TTParameters(object): self.trace_scalar_ops = self.is_flag_on(FLAG_NAME_TRACE_SCALAR_OPS) self.use_compact_trace = self.is_flag_on(FLAG_NAME_USE_COMPACT_TRACE) self.use_temp_cache_var = self.is_flag_on(FLAG_NAME_TEMP_CACHE_VAR) + self.use_fingerprint_subdir = self.is_flag_on(FLAG_NAME_FINGERPRINT_DIR) # _trace_ops_before_included and _trace_ops_after_included denotes to depth # of tracing relative to the ops given in --included_opnames or @@ -274,7 +276,7 @@ class TTParameters(object): FLAG_NAME_INCLUDE_LESS_INTERESTING_OPS, FLAG_NAME_OP_RANGE, FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS, FLAG_NAME_TRACE_LEVEL, 
FLAG_NAME_SUMMARY_SIGNATURES, FLAG_NAME_SUMMARY_PER_CORE, - FLAG_NAME_TEMP_CACHE_VAR + FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR ] tensor_tracer_flags = self._env.get(FLAGS_ENV_VAR) if not tensor_tracer_flags: diff --git a/tensorflow/python/tpu/tensor_tracer_report.py b/tensorflow/python/tpu/tensor_tracer_report.py index e8a122d981f..3270b2a2fd3 100644 --- a/tensorflow/python/tpu/tensor_tracer_report.py +++ b/tensorflow/python/tpu/tensor_tracer_report.py @@ -19,8 +19,10 @@ from __future__ import division from __future__ import print_function import collections +import hashlib import os + from tensorflow.python.platform import gfile from tensorflow.python.platform import tf_logging as logging from tensorflow.python.tpu import tensor_tracer_pb2 @@ -53,6 +55,18 @@ _CURRENT_VERSION = 'use-outside-compilation' _TT_REPORT_PROTO = 'tensor_tracer_report.report_pb' +def report_proto_path(trace_dir): + """Returns the path where report proto should be written. + + Args: + trace_dir: String denoting the trace directory. + + Returns: + A string denoting the path to the report proto. + """ + return os.path.join(trace_dir, _TT_REPORT_PROTO) + + def topological_sort(g): """Performs topological sort on the given graph. @@ -206,6 +220,12 @@ class OpenReportFile(object): self._report_file.close() +def proto_fingerprint(message_proto): + serialized_message = message_proto.SerializeToString() + hasher = hashlib.sha256(serialized_message) + return hasher.hexdigest() + + class TTReportHandle(object): """Utility class responsible from creating a tensor tracer report.""" @@ -255,8 +275,6 @@ class TTReportHandle(object): key=lambda x: x[1]): report.config.signatures.append(signature_name) - tf_graph = tensor_trace_order.graph_order.graph - report.graphdef.CopyFrom(tf_graph.as_graph_def()) for tensor in tensor_trace_order.graph_order.tensors: tensor_def = tensor_tracer_pb2.TensorTracerReport.TracedTensorDef() tensor_def.name = tensor.name @@ -265,6 +283,11 @@ class TTReportHandle(object): tensor_def.cache_index = ( tensor_trace_order.tensorname_to_cache_idx[tensor.name]) else: + # To prevent small changes affecting the fingerprint calculation, avoid + # writing the untraced tensors to metadata. Fingerprints will be + # different only when the list of the traced tensors are different. + if tt_parameters.use_fingerprint_subdir: + continue tensor_def.is_traced = False if tensor.name in tensor_trace_points: @@ -274,12 +297,17 @@ class TTReportHandle(object): elif tensor.op.name in self.instrument_records: tensor_def.explanation = self.instrument_records[tensor.op.name] report.tensordef[tensor.name].CopyFrom(tensor_def) + report.fingerprint = proto_fingerprint(report) + logging.info('TensorTracerProto fingerprint is %s.', + report.fingerprint) + tf_graph = tensor_trace_order.graph_order.graph + report.graphdef.CopyFrom(tf_graph.as_graph_def()) return report def write_report_proto(self, report_proto, tt_parameters): """Writes the given report proto under trace_dir.""" gfile.MakeDirs(tt_parameters.trace_dir) - report_path = os.path.join(tt_parameters.trace_dir, _TT_REPORT_PROTO) + report_path = report_proto_path(tt_parameters.trace_dir) with gfile.GFile(report_path, 'wb') as f: f.write(report_proto.SerializeToString()) From 4fc945e30a0dfaa53848ad4393cb78dcb3283ef4 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 15 May 2020 18:09:14 -0700 Subject: [PATCH 321/412] [tf.data] Reduce verbosity of a warning as it is not actionable and appears for all programs that use tf.data with tf.distribute. 
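Usage note (not part of this change): once demoted to VLOG(1), the message only appears when verbose C++ logging is enabled. One way to do that, assuming the default logging platform that reads TF_CPP_MIN_VLOG_LEVEL, is:

  # Illustrative sketch: the env var must be set before TensorFlow is imported.
  import os
  os.environ["TF_CPP_MIN_VLOG_LEVEL"] = "1"
  import tensorflow as tf  # imported after setting the variable on purpose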
PiperOrigin-RevId: 311834993 Change-Id: Iafb60c31008369f48e986f9ff3b400a9d5ada36d --- tensorflow/core/kernels/data/captured_function.cc | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc index 28738e3e2fe..adba99d37a4 100644 --- a/tensorflow/core/kernels/data/captured_function.cc +++ b/tensorflow/core/kernels/data/captured_function.cc @@ -466,17 +466,15 @@ Status FunctionMetadata::Create( auto attr = fdef->attr().find(FunctionLibraryDefinition::kIntsOnDeviceAttr); if (attr != fdef->attr().end() && attr->second.b()) { - LOG(WARNING) - << "Disabling multi-device execution for a function that uses the " - << FunctionLibraryDefinition::kIntsOnDeviceAttr << " attribute."; + VLOG(1) << "Disabling multi-device execution for a function that uses the " + << FunctionLibraryDefinition::kIntsOnDeviceAttr << " attribute."; (*out_metadata)->use_multi_device_function_ = false; return Status::OK(); } auto validate_arg = [](const OpDef::ArgDef& arg) { if (!arg.number_attr().empty() || !arg.type_list_attr().empty()) { - LOG(WARNING) << "Disabling multi-device execution for a function with " - "a vector argument " - << arg.name() << "."; + VLOG(1) << "Disabling multi-device execution for a function with " + << "a vector argument " << arg.name() << "."; return false; } return true; From 312079996985b7a15ad7ff27c39ece6625e30121 Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Fri, 15 May 2020 18:50:51 -0700 Subject: [PATCH 322/412] [XLA] Calculate and print statistics about prefetches and evictions. PiperOrigin-RevId: 311839276 Change-Id: Iec0b0318825c665cfca067d4edf30b56e8f9c833 --- .../xla/service/memory_space_assignment.cc | 40 +++++++++++++++---- .../xla/service/memory_space_assignment.h | 14 +++++-- .../service/memory_space_assignment_test.cc | 25 +++++++++--- 3 files changed, 62 insertions(+), 17 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 742de71e74c..431e6af2dc0 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -1706,20 +1706,39 @@ AlternateMemoryBestFitHeap::FindBestChunkCandidate( return absl::nullopt; } -/*static*/ int64 MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies( - const HloModule& module) { - int64 max_copies = 0; +StatusOr +MemorySpaceAssignment::CalculateAsyncCopyStats() const { + AsyncCopyStats stats; + stats.max_outstanding_async_copies = 0; + stats.num_prefetches = 0; + stats.prefetch_bytes = 0; + stats.num_evictions = 0; + stats.eviction_bytes = 0; int64 current_copies = 0; - for (HloInstruction* instruction : - module.schedule().sequence(module.entry_computation()).instructions()) { + TF_ASSIGN_OR_RETURN(std::unique_ptr dataflow_analysis, + HloDataflowAnalysis::Run(*module_)); + for (HloInstruction* instruction : module_->schedule() + .sequence(module_->entry_computation()) + .instructions()) { if (instruction->opcode() == HloOpcode::kCopyStart) { current_copies++; } else if (instruction->opcode() == HloOpcode::kCopyDone) { current_copies--; + int64 size = + options_.size_fn(dataflow_analysis->GetUniqueValueAt(instruction)); + if (instruction->shape().layout().memory_space() == + options_.alternate_memory_space) { + ++stats.num_prefetches; + stats.prefetch_bytes += size; + } else { + ++stats.num_evictions; + stats.eviction_bytes += size; + } } - 
max_copies = std::max(max_copies, current_copies); + stats.max_outstanding_async_copies = + std::max(stats.max_outstanding_async_copies, current_copies); } - return max_copies; + return stats; } /*static*/ MemorySpaceAssignment::BufferIntervalCompare @@ -1851,8 +1870,13 @@ MemorySpaceAssignment::RunMemorySpaceAssignment( VLOG(3) << "Module after memory space assignment: "; XLA_VLOG_LINES(3, module_->ToString()); TF_CHECK_OK(module_->schedule().Verify()); + TF_ASSIGN_OR_RETURN(AsyncCopyStats stats, CalculateAsyncCopyStats()); VLOG(1) << "Maximum number of outstanding async copies: " - << CountMaximumOutstandingAsyncCopies(*module_); + << stats.max_outstanding_async_copies; + VLOG(1) << "Number of prefetches: " << stats.num_prefetches + << ", in bytes: " << stats.prefetch_bytes; + VLOG(1) << "Number of evictions: " << stats.num_evictions + << ", in bytes: " << stats.eviction_bytes; TF_RETURN_IF_ERROR(VerifyAndExportHeapSimulatorTrace()); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index eb16db90600..727b8da6c08 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -604,6 +604,15 @@ class MemorySpaceAssignment { AllocationSequence allocation_sequence_; }; + // Statistics of asynchronous copies. + struct AsyncCopyStats { + int64 max_outstanding_async_copies; + int64 num_prefetches; + int64 prefetch_bytes; + int64 num_evictions; + int64 eviction_bytes; + }; + virtual ~MemorySpaceAssignment() = default; // Runs the MemorySpaceAssignment pass. @@ -611,9 +620,8 @@ class MemorySpaceAssignment { HloModule* module, const HloLiveRange& hlo_live_range, const HloAliasAnalysis& alias_analysis, const Options& options); - // Returns the maximum number of outstanding asynchronous copies in the - // module. - static int64 CountMaximumOutstandingAsyncCopies(const HloModule& module); + // Calculates asynchronous copy statistics. 
+ StatusOr CalculateAsyncCopyStats() const; static BufferIntervalCompare GetMemoryBoundednessBufferIntervalCompare( const MemorySpaceAssignmentCostAnalysis& cost_analysis); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index b2125d318d0..984f2e7b4ea 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -184,6 +184,22 @@ class MemorySpaceAssignmentTest : public HloTestBase, } } + /*static*/ int64 CountMaximumOutstandingAsyncCopies(const HloModule& module) { + int64 max_copies = 0; + int64 current_copies = 0; + for (HloInstruction* instruction : module.schedule() + .sequence(module.entry_computation()) + .instructions()) { + if (instruction->opcode() == HloOpcode::kCopyStart) { + current_copies++; + } else if (instruction->opcode() == HloOpcode::kCopyDone) { + current_copies--; + } + max_copies = std::max(max_copies, current_copies); + } + return max_copies; + } + std::unique_ptr CreateEvictAndPrefetchModule() { HloComputation::Builder builder(TestName()); Shape shape = ShapeUtil::MakeShape(F32, {2, 3}); @@ -391,8 +407,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies0) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/0); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 0); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 0); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { @@ -400,8 +415,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies1) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/1); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 1); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 1); } TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { @@ -409,8 +423,7 @@ TEST_P(MemorySpaceAssignmentTest, EvictAndPrefetchLimitAsyncCopies2) { AssignMemorySpace(module.get(), /*max_outstanding_async_copies=*/2); - EXPECT_EQ(MemorySpaceAssignment::CountMaximumOutstandingAsyncCopies(*module), - 2); + EXPECT_EQ(CountMaximumOutstandingAsyncCopies(*module), 2); } // TODO(berkin): This test is broken with some prefetch timing improvements. From 90dc8696e7cc810a0c5df8f1cc6cbc4cd0d70ccf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 20:22:59 -0700 Subject: [PATCH 323/412] Fix the quantile accumulator to always return at most the number of requested boundaries. PiperOrigin-RevId: 311846622 Change-Id: I6a3d4bf3efbf4ce171f62e1af2dc29bd77cd4063 --- .../kernels/boosted_trees/quantile_ops.cc | 5 ++++- .../boosted_trees/quantile_ops_test.py | 22 ++++++++++++++----- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc index 0065bdd66aa..0de08bcff2d 100644 --- a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc +++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc @@ -65,7 +65,8 @@ std::vector GenerateBoundaries(const QuantileStream& stream, // Uniquify elements as we may get dupes. 
auto end_it = std::unique(boundaries.begin(), boundaries.end()); - boundaries.resize(std::distance(boundaries.begin(), end_it)); + boundaries.resize(std::min(std::distance(boundaries.begin(), end_it), + num_boundaries)); return boundaries; } @@ -421,6 +422,8 @@ class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel { generate_quantiles_ ? GenerateQuantiles(*stream, num_buckets) : GenerateBoundaries(*stream, num_buckets), stream_idx); + VLOG(1) << "Created " << stream_resource->boundaries(stream_idx).size() + << " boundaries."; } }; diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py index fb44c33d602..7c3a382c955 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py @@ -82,7 +82,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): self.eps = 0.01 self.max_elements = 1 << 16 - self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64) + self.num_quantiles = constant_op.constant(4, dtype=dtypes.int64) def testBasicQuantileBucketsSingleResource(self): with self.cached_session() as sess: @@ -183,7 +183,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -202,7 +205,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() @@ -215,7 +221,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -233,7 +242,10 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") + num_streams=2, + num_quantiles=self.num_quantiles, + epsilon=self.eps, + name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() From b60f79dad844f2b63d17d86ac46ff982b1e43057 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 21:18:33 -0700 Subject: [PATCH 324/412] Fix the quantile accumulator to always return at most the number of requested boundaries. 
PiperOrigin-RevId: 311850905 Change-Id: If188c30fdb6e9968c809c60bffdd0c8a31297cac --- .../kernels/boosted_trees/quantile_ops.cc | 5 +---- .../boosted_trees/quantile_ops_test.py | 22 +++++-------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc index 0de08bcff2d..0065bdd66aa 100644 --- a/tensorflow/core/kernels/boosted_trees/quantile_ops.cc +++ b/tensorflow/core/kernels/boosted_trees/quantile_ops.cc @@ -65,8 +65,7 @@ std::vector GenerateBoundaries(const QuantileStream& stream, // Uniquify elements as we may get dupes. auto end_it = std::unique(boundaries.begin(), boundaries.end()); - boundaries.resize(std::min(std::distance(boundaries.begin(), end_it), - num_boundaries)); + boundaries.resize(std::distance(boundaries.begin(), end_it)); return boundaries; } @@ -422,8 +421,6 @@ class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel { generate_quantiles_ ? GenerateQuantiles(*stream, num_buckets) : GenerateBoundaries(*stream, num_buckets), stream_idx); - VLOG(1) << "Created " << stream_resource->boundaries(stream_idx).size() - << " boundaries."; } }; diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py index 7c3a382c955..fb44c33d602 100644 --- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py +++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py @@ -82,7 +82,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): self.eps = 0.01 self.max_elements = 1 << 16 - self.num_quantiles = constant_op.constant(4, dtype=dtypes.int64) + self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64) def testBasicQuantileBucketsSingleResource(self): with self.cached_session() as sess: @@ -183,10 +183,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -205,10 +202,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() @@ -221,10 +215,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.cached_session() as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() resources.initialize_resources(resources.shared_resources()).run() @@ -242,10 +233,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase): with self.session(graph=ops.Graph()) as sess: accumulator = boosted_trees_ops.QuantileAccumulator( - num_streams=2, - num_quantiles=self.num_quantiles, - epsilon=self.eps, - name="q0") + num_streams=2, num_quantiles=3, epsilon=self.eps, name="q0") save = saver.Saver() save.restore(sess, save_path) buckets = accumulator.get_bucket_boundaries() From eb71191f7b03e8248d760ea02776582536b7492d Mon 
Sep 17 00:00:00 2001 From: Xunkai Zhang Date: Fri, 15 May 2020 21:32:46 -0700 Subject: [PATCH 325/412] [tfls.metadata] Use java7 version opts to build metadata lib. PiperOrigin-RevId: 311851755 Change-Id: I853e23a60f37cb89cc57653e85ff708ed467d512 --- third_party/flatbuffers/build_defs.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/flatbuffers/build_defs.bzl b/third_party/flatbuffers/build_defs.bzl index d07ad18630f..9be627119cf 100644 --- a/third_party/flatbuffers/build_defs.bzl +++ b/third_party/flatbuffers/build_defs.bzl @@ -472,6 +472,7 @@ def flatbuffer_java_library( native.java_library( name = name, srcs = [out_srcjar], + javacopts = ["-source 7 -target 7"], deps = [ "@flatbuffers//:runtime_java", ], @@ -562,7 +563,6 @@ def flatbuffer_android_library( srcs, custom_package = "", package_prefix = "", - javacopts = None, include_paths = DEFAULT_INCLUDE_PATHS, flatc_args = DEFAULT_FLATC_ARGS, visibility = None): @@ -575,7 +575,6 @@ def flatbuffer_android_library( namespace in the schema files will be used. (optional) package_prefix: like custom_package, but prefixes to the existing namespace. (optional) - javacopts: List of options to pass to javac. include_paths: List of paths that includes files can be found in. (optional) flatc_args: List of additional arguments to pass to flatc. (optional) visibility: Visibility setting for the android_library rule. (optional) @@ -604,6 +603,7 @@ def flatbuffer_android_library( android_library( name = name, srcs = [out_srcjar], + javacopts = ["-source 7 -target 7"], visibility = visibility, deps = [ "@flatbuffers//:runtime_android", From e234c0a44e526dd79d782ad5623ea9f3f3298139 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 15 May 2020 23:01:01 -0700 Subject: [PATCH 326/412] [tf.data] Update output time functions to solve the stack overflow problem. Also update some mathematics computation in the code. PiperOrigin-RevId: 311858942 Change-Id: Iafa345b5d235c60a455671c924af594396a361ad --- tensorflow/core/framework/model.cc | 658 +++++++++++------- tensorflow/core/framework/model.h | 82 ++- tensorflow/core/framework/model_test.cc | 114 +-- .../python/data/kernel_tests/options_test.py | 2 +- 4 files changed, 533 insertions(+), 323 deletions(-) diff --git a/tensorflow/core/framework/model.cc b/tensorflow/core/framework/model.cc index b4a54029a4f..658be94b9bb 100644 --- a/tensorflow/core/framework/model.cc +++ b/tensorflow/core/framework/model.cc @@ -25,10 +25,6 @@ namespace data { namespace model { namespace { -// Key of the derivative w.r.t. the last input time in the gradient of -// `OutputTime`. -constexpr char kInputTimeDerivativeKey[] = "last_input_time"; - // Wrapper for the square function to reduce verbosity. 
inline double Square(double x) { return x * x; } @@ -50,34 +46,60 @@ class InterleaveMany : public Node { Args{id_, name_, std::move(output)}); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double old_input_time; + if (output_) { + old_input_time = (*input_times)[output_->long_name()]; + } else { + old_input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + + if (num_inputs() <= 1) { + (*input_times)[long_name()] = old_input_time; + return; + } + double new_input_time = + old_input_time + + SelfProcessingTimeLocked() * static_cast(num_inputs() - 1); + (*input_times)[long_name()] = new_input_time; + } + // The output time is the sum of the self processing time and the average // output time of inputs comprising the interleave "cycle". - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); if (num_inputs() <= 1) { - return SelfProcessingTimeLocked(); - } - double delta = SelfProcessingTimeLocked() * (num_inputs() - 1); - input_times->back() += delta; - auto cleanup = gtl::MakeCleanup( - [input_times, delta]() { input_times->back() -= delta; }); - double output_time; - if (gradient) { - absl::flat_hash_map inputs_gradient; - output_time = - (OutputTimeForInputs(input_times, &inputs_gradient) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr)) / - static_cast(num_inputs() - 1); - for (auto& pair : inputs_gradient) { - (*gradient)[pair.first] = - pair.second / static_cast(num_inputs() - 1); + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); + } } - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + inputs_gradient[kInputTimeDerivativeKey] / - static_cast(num_inputs() - 1); + return; + } + + double output_time = (OutputTimeForInputs(*output_times) - + (*output_times)[inputs_.front()->long_name()]) / + static_cast(num_inputs() - 1); + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient /= static_cast(num_inputs() - 1); + } + } + + (*output_time_gradients)[long_name()] = + (OutputTimeGradientsForInputs(*output_time_gradients) - + (*output_time_gradients)[inputs_.front()->long_name()]) / + static_cast(num_inputs() - 1); + // Set derivatives w.r.t. tunable parameters of the subtree rooted in the // first input equal to 0 since its output time is excluded from // computations. 
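The pattern above repeats for every node class in this patch: rather than each node recursively calling OutputTime on its inputs (which can overflow the stack on very long input pipelines, the problem named in the commit message), input times are filled in from the root towards the leaves and output times from the leaves back to the root, with both stored in hash maps keyed by each node's long name. A minimal, self-contained sketch of that two-pass shape, using a toy node type and deliberately simplified timing rules (the real combination rules are the per-class InputTimeLocked/OutputTimeLocked overrides in this diff):

    #include <algorithm>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Toy stand-in for the autotuning graph; the real code uses the Node class
    // hierarchy and absl::flat_hash_map.
    struct ToyNode {
      std::string name;
      ToyNode* output = nullptr;     // downstream consumer (nullptr for the root)
      std::vector<ToyNode*> inputs;  // upstream producers
      double self_time = 0.0;        // per-element self processing time

      // Pass 1 hook: derive this node's input time from its consumer's.
      void ComputeInputTime(std::unordered_map<std::string, double>* input_times) const {
        double upstream = output ? (*input_times)[output->name] : 0.0;
        (*input_times)[name] = upstream + self_time;  // simplified propagation rule
      }

      // Pass 2 hook: combine the already-computed output times of the inputs.
      void ComputeOutputTime(std::unordered_map<std::string, double>* output_times) const {
        double sum = 0.0;
        for (const ToyNode* in : inputs) sum += (*output_times)[in->name];
        (*output_times)[name] = self_time + sum;  // simplified combination rule
      }
    };

    // Iterative analogue of Node::OutputTime: `bfs_order` lists every node below
    // `root` in breadth-first order, so no recursion (and no deep call stack) is
    // needed regardless of how long the pipeline is.
    double OutputTimeIterative(ToyNode* root, std::vector<ToyNode*> bfs_order) {
      std::unordered_map<std::string, double> input_times;
      std::unordered_map<std::string, double> output_times;
      root->ComputeInputTime(&input_times);
      for (ToyNode* n : bfs_order) n->ComputeInputTime(&input_times);
      std::reverse(bfs_order.begin(), bfs_order.end());  // leaves first
      for (ToyNode* n : bfs_order) n->ComputeOutputTime(&output_times);
      root->ComputeOutputTime(&output_times);
      return output_times[root->name];
    }

In the patch itself the two hooks are InputTimeLocked and OutputTimeLocked, the traversal comes from CollectNodes(TraversalOrder::BFS), and the gradients are threaded through the same walk.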
@@ -85,15 +107,10 @@ class InterleaveMany : public Node { first_input_parameters; inputs_.front()->CollectTunableParameters(&first_input_parameters); for (auto& pair : first_input_parameters) { - (*gradient)[pair.first] = 0.0L; + (*gradients)[pair.first] = 0.0L; } - } else { - output_time = - (OutputTimeForInputs(input_times, /*gradient=*/nullptr) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr)) / - static_cast(num_inputs() - 1); } - return SelfProcessingTimeLocked() + output_time; + (*output_times)[long_name()] = self_processing_time + output_time; } // The processing time is the sum of the self processing time and the average @@ -107,16 +124,15 @@ class InterleaveMany : public Node { (*processing_times)[long_name()] = self_processing_time; } if (num_inputs() <= 1) { - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time)); + (*total_processing_times)[long_name()] = self_processing_time; return; } double processing_time = (TotalProcessingTimeForInputs(*total_processing_times) - (*total_processing_times)[inputs_.front()->long_name()]) / static_cast(num_inputs() - 1); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } }; @@ -148,55 +164,85 @@ class AsyncInterleaveMany : public Node { Args{id_, name_, std::move(output)}, parameters); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double input_time; + + if (num_inputs() <= 1) { + if (output_) { + input_time = (*input_times)[output_->long_name()]; + } else { + input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + } else { + input_time = + SelfProcessingTimeLocked() * static_cast(num_inputs() - 1); + } + (*input_times)[long_name()] = input_time; + } + // The output time is estimated using `ComputeWaitTime(output_time, // input_time, parallelism, ...)`, where `output_time` is the sum of the // self-processing time and the average output time of inputs comprising the // interleave "cycle", `input_time` is specified through `input_times` and // `buffer_size` is derived from parallelism. 
- double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); if (num_inputs() <= 1) { - return SelfProcessingTimeLocked(); + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); + } + } + return; } - double old_input_time = input_times->back(); - double new_input_time = - SelfProcessingTimeLocked() * static_cast(num_inputs() - 1); - input_times->push_back(new_input_time); - auto cleanup = - gtl::MakeCleanup([input_times]() { input_times->pop_back(); }); + + double input_time; + if (output_) { + input_time = input_times.at(output_->long_name()); + } else { + input_time = gtl::FindWithDefault(input_times, kInputTimeKey, 0.0L); + } + double parallelism = num_inputs() - 1; // default to cycle length auto* parameter = gtl::FindOrNull(parameters_, kParallelism); if (parameter) { parallelism = std::min(parallelism, (*parameter)->value); } - if (gradient) { - absl::flat_hash_map inputs_gradient; - double output_time_for_inputs = - OutputTimeForInputs(input_times, &inputs_gradient) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr); - double output_time = output_time_for_inputs / - static_cast(num_inputs() - 1) / parallelism; + + double output_time_for_inputs = + OutputTimeForInputs(*output_times) - + (*output_times)[inputs_.front()->long_name()]; + double output_time = output_time_for_inputs / + static_cast(num_inputs() - 1) / parallelism; + double result; + + if (gradients) { double output_time_der = 0.0L; double input_time_der = 0.0L; double buffer_size_der = 0.0L; - double result = ComputeWaitTime( - SelfProcessingTimeLocked() + output_time, old_input_time, parallelism, - &output_time_der, &input_time_der, &buffer_size_der); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + input_time_der; + result = ComputeWaitTime(self_processing_time + output_time, input_time, + parallelism, &output_time_der, &input_time_der, + &buffer_size_der); + (*output_time_gradients)[long_name()] = input_time_der; double parallelism_der = -output_time_for_inputs / static_cast(num_inputs() - 1) / Square(parallelism); - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = output_time_der * pair.second / - static_cast(num_inputs() - 1) / - parallelism; + + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= (output_time_der / + static_cast(num_inputs() - 1) / parallelism); } } + // Set derivatives w.r.t. tunable parameters of the subtree rooted in the // first input equal to 0 since its output time is excluded from // computations. @@ -204,23 +250,21 @@ class AsyncInterleaveMany : public Node { first_input_parameters; inputs_.front()->CollectTunableParameters(&first_input_parameters); for (auto& pair : first_input_parameters) { - (*gradient)[pair.first] = 0.0L; + (*gradients)[pair.first] = 0.0L; } // Add derivative w.r.t. 
own parallelism parameter. if (parameter && (*parameter)->state->tunable) { - (*gradient)[long_name()] = + (*gradients)[long_name()] = output_time_der * parallelism_der + buffer_size_der; } - return result; + } else { + result = ComputeWaitTime(self_processing_time + output_time, input_time, + parallelism, + /*output_time_derivative=*/nullptr, + /*input_time_derivative=*/nullptr, + /*buffer_size_derivative=*/nullptr); } - double output_time = - (OutputTimeForInputs(input_times, /*gradient=*/nullptr) - - inputs_.front()->OutputTime(input_times, /*gradient=*/nullptr)) / - static_cast(num_inputs() - 1) / parallelism; - return ComputeWaitTime( - SelfProcessingTimeLocked() + output_time, old_input_time, parallelism, - /*output_time_derivative=*/nullptr, - /*input_time_derivative=*/nullptr, /*buffer_size_derivative=*/nullptr); + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the average @@ -234,16 +278,15 @@ class AsyncInterleaveMany : public Node { (*processing_times)[long_name()] = self_processing_time; } if (num_inputs() <= 1) { - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time)); + (*total_processing_times)[long_name()] = self_processing_time; return; } double processing_time = (TotalProcessingTimeForInputs(*total_processing_times) - (*total_processing_times)[inputs_.front()->long_name()]) / static_cast(num_inputs() - 1); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } }; @@ -260,41 +303,55 @@ class KnownRatio : public Node { ratio_); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double old_input_time; + if (output_) { + old_input_time = (*input_times)[output_->long_name()]; + } else { + old_input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + + if (ratio_ == 0) { + (*input_times)[long_name()] = old_input_time; + return; + } + double new_input_time = + (old_input_time + SelfProcessingTimeLocked()) / ratio_; + (*input_times)[long_name()] = new_input_time; + } + // The output time is the sum of the self processing time and the product of // `ratio_` and the sum of output times of inputs. 
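  // Since the result is self_processing_time + ratio_ * (sum of input output
  // times), the chain rule scales every gradient collected from the input
  // subtree by ratio_, which is all the rewritten body below does once the
  // inputs' output times and gradients are already available in the shared maps.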
- double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); if (ratio_ == 0) { - return SelfProcessingTimeLocked(); - } - double old_input_time = input_times->back(); - input_times->back() = - (old_input_time + SelfProcessingTimeLocked()) / ratio_; - auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() { - input_times->back() = old_input_time; - }); - double result; - if (gradient) { - absl::flat_hash_map inputs_gradient; - result = SelfProcessingTimeLocked() + - ratio_ * OutputTimeForInputs(input_times, &inputs_gradient); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + ratio_ * - inputs_gradient[kInputTimeDerivativeKey] * - (1.0L + 1.0L / ratio_); - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = pair.second * ratio_; + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); } } - } else { - result = SelfProcessingTimeLocked() + - ratio_ * OutputTimeForInputs(input_times, /*gradient=*/nullptr); + return; } - return result; + double result = + self_processing_time + ratio_ * OutputTimeForInputs(*output_times); + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= ratio_; + } + } + (*output_time_gradients)[long_name()] = + OutputTimeGradientsForInputs(*output_time_gradients); + } + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the product @@ -309,8 +366,8 @@ class KnownRatio : public Node { } double processing_time = ratio_ * TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } private: @@ -340,6 +397,29 @@ class AsyncKnownRatio : public Node { Args{id_, name_, std::move(output)}, ratio_, parameters); } + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double input_time; + + if (ratio_ == 0.0) { + if (output_) { + input_time = (*input_times)[output_->long_name()]; + } else { + input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + (*input_times)[long_name()] = input_time; + return; + } + + double parallelism = 1.0; + auto* parallelism_parameter = gtl::FindOrNull(parameters_, kParallelism); + if (parallelism_parameter) { + parallelism = (*parallelism_parameter)->value; + } + input_time = SelfProcessingTimeLocked() / ratio_ / parallelism; + (*input_times)[long_name()] = input_time; + } + // The output time is estimated using `ComputeWaitTime(output_time, // input_time, parallelism, ...)`, where `output_time` is the sum of the self // processing time and the product of `ratio_` and the sum of output times of @@ -347,9 +427,12 @@ class AsyncKnownRatio : public Node 
{ // has parallelism parameter, then `buffer_size` is derived from parallelism. // // Current implementation assumes that there is at most 1 parameter per node. - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { double parallelism = 1.0; double buffer_size = 0.0; auto* parallelism_parameter = gtl::FindOrNull(parameters_, kParallelism); @@ -361,80 +444,85 @@ class AsyncKnownRatio : public Node { buffer_size = (*buffer_size_parameter)->value; } double self_processing_time = SelfProcessingTimeLocked(); + double result; + double input_time; + if (output_) { + input_time = input_times.at(output_->long_name()); + } else { + input_time = gtl::FindWithDefault(input_times, kInputTimeKey, 0.0L); + } + if (ratio_ == 0.0) { double output_time = self_processing_time / parallelism; - if (gradient) { + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); + } + double output_time_der = 0.0L; double input_time_der = 0.0L; double buffer_size_der = 0.0L; - double result = ComputeWaitTime(output_time, input_times->back(), - buffer_size, &output_time_der, - &input_time_der, &buffer_size_der); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + input_time_der; + result = ComputeWaitTime(output_time, input_time, buffer_size, + &output_time_der, &input_time_der, + &buffer_size_der); + (*output_time_gradients)[long_name()] = input_time_der; // Add derivative w.r.t. own parameter if it's tunable. 
if (parallelism_parameter && (*parallelism_parameter)->state->tunable) { - (*gradient)[long_name()] = + (*gradients)[long_name()] = -output_time_der * self_processing_time / Square(parallelism) + buffer_size_der; } else if (buffer_size_parameter && (*buffer_size_parameter)->state->tunable) { - (*gradient)[long_name()] = buffer_size_der; + (*gradients)[long_name()] = buffer_size_der; } - return result; + } else { + result = ComputeWaitTime(output_time, input_time, buffer_size, + /*output_time_derivative=*/nullptr, + /*input_time_derivative=*/nullptr, + /*buffer_size_derivative=*/nullptr); } - return ComputeWaitTime(output_time, input_times->back(), buffer_size, - /*output_time_derivative=*/nullptr, - /*input_time_derivative=*/nullptr, - /*buffer_size_derivative=*/nullptr); + (*output_times)[long_name()] = result; + return; } - double old_input_time = input_times->back(); - double new_input_time = self_processing_time / ratio_ / parallelism; - input_times->push_back(new_input_time); - auto cleanup = - gtl::MakeCleanup([input_times]() { input_times->pop_back(); }); - if (gradient) { - absl::flat_hash_map inputs_gradient; + + double output_time = self_processing_time / parallelism + + ratio_ * OutputTimeForInputs(*output_times); + if (gradients) { double output_time_der = 0.0L; double input_time_der = 0.0L; double buffer_size_der = 0.0L; - double output_time = - self_processing_time / parallelism + - ratio_ * OutputTimeForInputs(input_times, &inputs_gradient); - double result = - ComputeWaitTime(output_time, old_input_time, buffer_size, + result = + ComputeWaitTime(output_time, input_time, buffer_size, &output_time_der, &input_time_der, &buffer_size_der); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + input_time_der; - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = pair.second * ratio_ * output_time_der; + (*output_time_gradients)[long_name()] = input_time_der; + + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= (ratio_ * output_time_der); } } + // Add derivative w.r.t. own parameter if it's tunable. 
if (parallelism_parameter && (*parallelism_parameter)->state->tunable) { - (*gradient)[long_name()] = + double inputs_time_der_sum = + OutputTimeGradientsForInputs(*output_time_gradients); + (*gradients)[long_name()] = -output_time_der * self_processing_time / Square(parallelism) + buffer_size_der - - output_time_der * inputs_gradient[kInputTimeDerivativeKey] * - self_processing_time / Square(parallelism); + output_time_der * inputs_time_der_sum * self_processing_time / + Square(parallelism); } else if (buffer_size_parameter && (*buffer_size_parameter)->state->tunable) { - (*gradient)[long_name()] = buffer_size_der; + (*gradients)[long_name()] = buffer_size_der; } - return result; + } else { + result = ComputeWaitTime(output_time, input_time, buffer_size, + /*output_time_derivative=*/nullptr, + /*input_time_derivative=*/nullptr, + /*buffer_size_derivative=*/nullptr); } - double output_time = - self_processing_time / parallelism + - ratio_ * OutputTimeForInputs(input_times, /*gradient=*/nullptr); - return ComputeWaitTime(output_time, old_input_time, buffer_size, - /*output_time_derivative=*/nullptr, - /*input_time_derivative=*/nullptr, - /*buffer_size_derivative=*/nullptr); + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the product @@ -449,8 +537,8 @@ class AsyncKnownRatio : public Node { } double processing_time = ratio_ * TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } private: @@ -469,44 +557,64 @@ class UnknownRatio : public Node { return std::make_shared(Args{id_, name_, std::move(output)}); } - // The output time is the sum of the self processing time and the product of - // the ratio estimate and the sum of output times of inputs. - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) + void InputTimeLocked(absl::flat_hash_map* input_times) const override TF_SHARED_LOCKS_REQUIRED(mu_) { + double old_input_time; + if (output_) { + old_input_time = (*input_times)[output_->long_name()]; + } else { + old_input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + if (num_elements_ == 0 || inputs_.empty() || inputs_.front()->num_elements() == 0) { - return SelfProcessingTimeLocked(); + (*input_times)[long_name()] = old_input_time; + return; } - // TODO(jsimsa): The current implementation assumes that the number of input - // elements consumed per output is the same across all inputs. 
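      // For example, an input that produced 400 elements while this node
      // produced 100 gives ratio = 400 / 100 = 4 inputs consumed per output, so
      // below the input time handed upstream is divided by 4 and, in
      // OutputTimeLocked, the inputs' output times are scaled by 4.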
std::shared_ptr input = inputs_.front(); double ratio = static_cast(input->num_elements()) / static_cast(num_elements_); - double old_input_time = input_times->back(); - input_times->back() = (old_input_time + SelfProcessingTimeLocked()) / ratio; - auto cleanup = gtl::MakeCleanup([input_times, old_input_time]() { - input_times->back() = old_input_time; - }); - if (gradient) { - absl::flat_hash_map inputs_gradient; - double result = - SelfProcessingTimeLocked() + - ratio * OutputTimeForInputs(input_times, &inputs_gradient); - auto last_input_time_der = - gtl::FindWithDefault(*gradient, kInputTimeDerivativeKey, 0.0L); - (*gradient)[kInputTimeDerivativeKey] = - last_input_time_der + - inputs_gradient[kInputTimeDerivativeKey] / ratio; - for (auto& pair : inputs_gradient) { - if (pair.first != kInputTimeDerivativeKey) { - (*gradient)[pair.first] = pair.second * ratio; + double new_input_time = + (old_input_time + SelfProcessingTimeLocked()) / ratio; + (*input_times)[long_name()] = new_input_time; + } + + // The output time is the sum of the self processing time and the product of + // the ratio estimate and the sum of output times of inputs. + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double self_processing_time = SelfProcessingTimeLocked(); + if (num_elements_ == 0 || inputs_.empty() || + inputs_.front()->num_elements() == 0) { + (*output_times)[long_name()] = self_processing_time; + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + gradients->erase(node->long_name()); } } - return result; + return; } - return SelfProcessingTimeLocked() + - ratio * OutputTimeForInputs(input_times, /*gradient=*/nullptr); + // TODO(jsimsa): The current implementation assumes that the number of input + // elements consumed per output is the same across all inputs. + double ratio = static_cast(inputs_.front()->num_elements()) / + static_cast(num_elements_); + double result = + self_processing_time + ratio * OutputTimeForInputs(*output_times); + if (gradients) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + auto* gradient = gtl::FindOrNull(*gradients, node->long_name()); + if (gradient) { + *gradient *= ratio; + } + } + (*output_time_gradients)[long_name()] = + OutputTimeGradientsForInputs(*output_time_gradients); + } + (*output_times)[long_name()] = result; } // The processing time is the sum of the self processing time and the product @@ -520,8 +628,7 @@ class UnknownRatio : public Node { (*processing_times)[long_name()] = self_processing_time; } if (inputs_.empty() || num_elements_ == 0) { - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time)); + (*total_processing_times)[long_name()] = self_processing_time; return; } // TODO(jsimsa): The current implementation assumes that the number of input @@ -531,8 +638,8 @@ class UnknownRatio : public Node { static_cast(num_elements_); double processing_time = ratio * TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), self_processing_time + processing_time)); + (*total_processing_times)[long_name()] = + self_processing_time + processing_time; } }; @@ -548,11 +655,30 @@ class Unknown : public Node { return std::make_shared(Args{id_, name_, std::move(output)}); } - // The output time is the sum of output times of inputs. 
- double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) + void InputTimeLocked(absl::flat_hash_map* input_times) const override TF_SHARED_LOCKS_REQUIRED(mu_) { - return OutputTimeForInputs(input_times, gradient); + double input_time; + if (output_) { + input_time = (*input_times)[output_->long_name()]; + } else { + input_time = gtl::FindWithDefault(*input_times, kInputTimeKey, 0.0L); + } + (*input_times)[long_name()] = input_time; + } + + // The output time is the sum of output times of inputs. + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + double result = OutputTimeForInputs(*output_times); + (*output_times)[long_name()] = result; + if (gradients) { + (*output_time_gradients)[long_name()] = + OutputTimeGradientsForInputs(*output_time_gradients); + } } // The processing time is the sum of processing times of inputs. @@ -562,8 +688,7 @@ class Unknown : public Node { TF_SHARED_LOCKS_REQUIRED(mu_) { double processing_time = TotalProcessingTimeForInputs(*total_processing_times); - total_processing_times->insert( - std::make_pair(long_name(), processing_time)); + (*total_processing_times)[long_name()] = processing_time; } }; @@ -751,19 +876,21 @@ double Node::ComputeWaitTime(const double& output_time, void Node::CollectTunableParameters( absl::flat_hash_map>* parameters) const { - CollectTunableParametersHelper(parameters); - + tf_shared_lock l(mu_); // Collect tunable parameters from the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->CollectTunableParametersHelper(parameters); } + CollectTunableParametersHelper(parameters); } string Node::DebugString() const { absl::flat_hash_map debug_strings; - + tf_shared_lock l(mu_); // Build up the debug string from the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->DebugStringHelper(&debug_strings); } DebugStringHelper(&debug_strings); @@ -780,10 +907,35 @@ void Node::FlushMetrics() { metrics_.record_num_elements(num_elements_); } -double Node::OutputTime(std::vector* input_times, - absl::flat_hash_map* gradient) const { +double Node::OutputTime(absl::flat_hash_map* input_times, + absl::flat_hash_map* gradients) const { + // To store the output time gradient w.r.t. input time (if `gradients` is not + // `nullptr`) and the output time for each node. + absl::flat_hash_map output_time_gradients, output_times; tf_shared_lock l(mu_); - return OutputTimeLocked(input_times, gradient); + auto nodes = CollectNodes(TraversalOrder::BFS); + + // Computes and stores input time for each node from the root to leaves of the + // nodes tree. + InputTimeLocked(input_times); + for (const auto& node : nodes) { + tf_shared_lock l(node->mu_); + node->InputTimeLocked(input_times); + } + + std::reverse(nodes.begin(), nodes.end()); + // Computes and stores the output time and output time gradient w.r.t. input + // time (if `gradients` is not `nullptr`) for each node from leaves of the + // nodes tree to the root. 
+ for (const auto& node : nodes) { + tf_shared_lock l(node->mu_); + node->OutputTimeLocked(*input_times, gradients, &output_times, + &output_time_gradients); + } + OutputTimeLocked(*input_times, gradients, &output_times, + &output_time_gradients); + + return output_times[long_name()]; } std::shared_ptr Node::Snapshot(std::shared_ptr output) const { @@ -808,9 +960,10 @@ double Node::SelfProcessingTime() const { double Node::TotalBufferedBytes() const { absl::flat_hash_map total_bytes; - + tf_shared_lock l(mu_); // Compute total buffered bytes from the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->TotalBufferedBytesHelper(&total_bytes); } TotalBufferedBytesHelper(&total_bytes); @@ -820,10 +973,11 @@ double Node::TotalBufferedBytes() const { double Node::TotalMaximumBufferedBytes() const { absl::flat_hash_map total_bytes; - + tf_shared_lock l(mu_); // Compute total maximum buffered bytes from the leaves of the nodes tree // to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { + tf_shared_lock l(node->mu_); node->TotalMaximumBufferedBytesHelper(&total_bytes); } TotalMaximumBufferedBytesHelper(&total_bytes); @@ -836,17 +990,16 @@ double Node::TotalProcessingTime( // Create a hash map to store the per-element CPU time spent in the subtree // rooted in each node. absl::flat_hash_map total_processing_times; + tf_shared_lock l(mu_); // Computes per-element CPU time spent in the subtree rooted in the node from // the leaves of the nodes tree to the root. - for (const auto& node : CollectNodes()) { + for (const auto& node : CollectNodes(TraversalOrder::REVERSE_BFS)) { tf_shared_lock l(node->mu_); node->TotalProcessingTimeLocked(processing_times, &total_processing_times); } - { - tf_shared_lock l(mu_); - TotalProcessingTimeLocked(processing_times, &total_processing_times); - } + TotalProcessingTimeLocked(processing_times, &total_processing_times); + return total_processing_times[long_name()]; } @@ -859,13 +1012,25 @@ double Node::AverageBufferedElementSize() const { } double Node::OutputTimeForInputs( - std::vector* input_times, - absl::flat_hash_map* gradient) const { + const absl::flat_hash_map& output_times) const { double sum = 0; for (auto& input : inputs_) { // Inputs for which autotuning is disabled are excluded. if (input->autotune()) { - sum += input->OutputTime(input_times, gradient); + sum += output_times.at(input->long_name()); + } + } + return sum; +} + +double Node::OutputTimeGradientsForInputs( + const absl::flat_hash_map& output_time_gradients) const { + double sum = 0; + for (auto& input : inputs_) { + // Inputs for which autotuning is disabled are excluded. 
+ if (input->autotune()) { + sum += + gtl::FindWithDefault(output_time_gradients, input->long_name(), 0.0L); } } return sum; @@ -919,12 +1084,12 @@ double Node::SelfProcessingTimeLocked() const { static_cast(num_elements_); } -Node::NodeVector Node::CollectNodes() const { +Node::NodeVector Node::CollectNodes(TraversalOrder order) const + TF_SHARED_LOCKS_REQUIRED(mu_) { NodeVector node_vector; std::list> temp_list; { - tf_shared_lock l(mu_); for (auto& input : inputs_) { node_vector.push_back(input); temp_list.push_back(input); @@ -942,16 +1107,19 @@ Node::NodeVector Node::CollectNodes() const { } } } - std::reverse(node_vector.begin(), node_vector.end()); + + if (order == TraversalOrder::REVERSE_BFS) { + std::reverse(node_vector.begin(), node_vector.end()); + } return node_vector; } void Node::CollectTunableParametersHelper( - absl::flat_hash_map>* parameters) const { + absl::flat_hash_map>* parameters) const + TF_SHARED_LOCKS_REQUIRED(mu_) { if (!autotune_) { return; } - tf_shared_lock l(mu_); for (auto& pair : parameters_) { if (pair.second->state->tunable) { parameters->insert(std::make_pair(long_name(), pair.second)); @@ -959,9 +1127,8 @@ void Node::CollectTunableParametersHelper( } } -void Node::DebugStringHelper( - absl::flat_hash_map* debug_strings) const { - tf_shared_lock l(mu_); +void Node::DebugStringHelper(absl::flat_hash_map* debug_strings) + const TF_SHARED_LOCKS_REQUIRED(mu_) { string result; strings::StrAppend(&result, long_name(), ":\n"); strings::StrAppend(&result, " autotune=", autotune_.load(), "\n"); @@ -1011,13 +1178,13 @@ std::shared_ptr Node::SnapshotHelper( } void Node::TotalBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const { + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_) { if (!autotune_) { total_bytes->insert(std::make_pair(long_name(), 0)); return; } - tf_shared_lock l(mu_); double result = 0; auto* parameter = gtl::FindOrNull(parameters_, kBufferSize); if (!parameter) { @@ -1033,13 +1200,13 @@ void Node::TotalBufferedBytesHelper( } void Node::TotalMaximumBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const { + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_) { if (!autotune_) { total_bytes->insert(std::make_pair(long_name(), 0)); return; } - tf_shared_lock l(mu_); double result = 0; auto* parameter = gtl::FindOrNull(parameters_, kBufferSize); if (!parameter) { @@ -1181,8 +1348,8 @@ void Model::OptimizeGradientDescent(int64 cpu_budget, int64 ram_budget) { double new_output_time; double new_value; for (int i = 0; i < kMaxIterations; ++i) { - absl::flat_hash_map gradient; - new_output_time = OutputTime(snapshot, &gradient); + absl::flat_hash_map gradients; + new_output_time = OutputTime(snapshot, &gradients); int64 model_parallelism = 0; for (auto& pair : essential_parameters) { model_parallelism += std::round(pair.second->value); @@ -1199,12 +1366,12 @@ void Model::OptimizeGradientDescent(int64 cpu_budget, int64 ram_budget) { for (auto& pair : parameters) { if (pair.second->value != pair.second->max) { max_abs_derivative = - std::max(max_abs_derivative, std::abs(gradient[pair.first])); + std::max(max_abs_derivative, std::abs(gradients[pair.first])); } } for (auto& pair : parameters) { new_value = pair.second->value - - kDescentStep * gradient[pair.first] / max_abs_derivative; + kDescentStep * gradients[pair.first] / max_abs_derivative; // Projection on a feasible interval. 
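      // Dividing by the largest absolute derivative (taken over parameters not
      // yet at their maximum) normalizes the step so that the steepest such
      // parameter moves by kDescentStep; the clamping below then projects each
      // new value back into its feasible [min, max] interval.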
if (new_value > pair.second->max) { pair.second->value = pair.second->max; @@ -1248,7 +1415,7 @@ void Model::OptimizeHillClimb(int64 cpu_budget, int64 ram_budget) { pair.second->value = pair.second->min; } while (true) { - const double output_time = OutputTime(snapshot, /*gradient=*/nullptr); + const double output_time = OutputTime(snapshot, /*gradients=*/nullptr); bool all_max = true; for (auto& pair : parameters) { if (pair.second->value < pair.second->max) { @@ -1267,7 +1434,7 @@ void Model::OptimizeHillClimb(int64 cpu_budget, int64 ram_budget) { continue; } pair.second->value++; - double new_output_time = OutputTime(snapshot, /*gradient=*/nullptr); + double new_output_time = OutputTime(snapshot, /*gradients=*/nullptr); double delta = output_time - new_output_time; if (delta > best_delta && (delta > kBufferSizeMinDelta || pair.second->name != kBufferSize)) { @@ -1297,15 +1464,18 @@ void Model::OptimizeHillClimb(int64 cpu_budget, int64 ram_budget) { } double Model::OutputTime(std::shared_ptr node, - absl::flat_hash_map* gradient) { - std::vector input_times(1, 0); + absl::flat_hash_map* gradients) { + // To store the input time for each node. + absl::flat_hash_map input_times; + // TODO(jsimsa): Now that we are accounting for buffer size in wait time // computation, assuming that the input is infinitely fast will result in // inaccurate estimates of the output latency. // // We should compute the output latency as a fix-point of the following // equation: `output_time = node(OutputTime(input_times(1, output_time))`. - return node->OutputTime(&input_times, gradient); + + return node->OutputTime(&input_times, gradients); } double Model::TotalBufferedBytes(std::shared_ptr node) { diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h index a4af549fad2..e325056f0c4 100644 --- a/tensorflow/core/framework/model.h +++ b/tensorflow/core/framework/model.h @@ -42,11 +42,19 @@ constexpr int64 kAutotune = -1; constexpr char kParallelism[] = "parallelism"; constexpr char kBufferSize[] = "buffer_size"; +// A key used to identify input time gradient. +constexpr char kInputTimeKey[] = "input_time"; + enum class AutotuneAlgorithm { HILL_CLIMB = 0, GRADIENT_DESCENT = 1, }; +enum class TraversalOrder { + BFS = 0, + REVERSE_BFS = 1, +}; + // Represents thread-safe state that can be shared between an input pipeline and // the performance model. struct SharedState { @@ -316,11 +324,11 @@ class Node { // Flushes the metrics recorded by this node. void FlushMetrics() TF_LOCKS_EXCLUDED(mu_); - // Returns the per-element output time for this node and if `gradient` is not - // `nullptr`, collects the gradient of the output time w.r.t. tunable - // parameters of the subtree rooted in this node and the last input time. - double OutputTime(std::vector* input_times, - absl::flat_hash_map* gradient) const + // Returns the per-element output time for this node and if `gradients` is not + // `nullptr`, collects the output time gradient w.r.t. tunable parameters of + // the subtree rooted in this node. + double OutputTime(absl::flat_hash_map* input_times, + absl::flat_hash_map* gradients) const TF_LOCKS_EXCLUDED(mu_); // Returns a copy of this node, making a deep copy of its inputs and a @@ -414,20 +422,34 @@ class Node { // Returns the average size of an element buffered in this node. 
double AverageBufferedElementSize() const TF_SHARED_LOCKS_REQUIRED(mu_); - // Returns the sum of per-element output time for the inputs of this node and - // if `gradient` is not `nullptr`, collects gradients of output times w.r.t. - // tunable parameters and the last input time. - double OutputTimeForInputs(std::vector* input_times, - absl::flat_hash_map* gradient) - const TF_SHARED_LOCKS_REQUIRED(mu_); + // Returns the sum of per-element output time for the tunable inputs of this + // node. + double OutputTimeForInputs( + const absl::flat_hash_map& output_times) const + TF_SHARED_LOCKS_REQUIRED(mu_); - // Returns the per-element output time for this node and if `gradient` is not - // `nullptr`, collects the gradient of the output time w.r.t. tunable - // parameters of the subtree rooted in this node and the last input time. - virtual double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) + // Returns the sum of output time gradient w.r.t. input time for the tunable + // inputs of this node. + double OutputTimeGradientsForInputs( + const absl::flat_hash_map& output_time_gradients) const + TF_SHARED_LOCKS_REQUIRED(mu_); + + // Computes the input time for this node and stores it in `input_times`. + virtual void InputTimeLocked(absl::flat_hash_map* input_times) const TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + // Computes the per-element output time for this node and stores it in + // `output_times`. If `gradients` is not `nullptr`, computes the output time + // gradient w.r.t. tunable parameters of the subtree rooted in this node and + // stores it in `gradients`, also computes the output time gradient w.r.t. + // input time and stores it in `output_time_gradients`. + virtual void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const + TF_SHARED_LOCKS_REQUIRED(mu_) = 0; + // Returns the sum of per-element processing time for the inputs of this node // by adding values for input nodes in `total_processing_times`. Processing // time for a given input is a weighted combination of a statistic based on @@ -452,18 +474,20 @@ class Node { absl::flat_hash_map* total_processing_times) TF_SHARED_LOCKS_REQUIRED(mu_) = 0; - // Returns a vector of nodes of the subtree rooted in this node. - // The nodes are in the reverse breadth-first search order. - NodeVector CollectNodes() const; + // Returns a vector of nodes of the subtree rooted in this node. The nodes are + // either in breadth-first search or reverse breadth-first search order + // depending on the `order` argument. The root node itself is not collected. + NodeVector CollectNodes(TraversalOrder order) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Collect tunable parameters for the node. void CollectTunableParametersHelper( - absl::flat_hash_map>* parameters) - const; + absl::flat_hash_map>* parameters) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Build up debug string for the node and store in the debug strings map. - void DebugStringHelper( - absl::flat_hash_map* debug_strings) const; + void DebugStringHelper(absl::flat_hash_map* debug_strings) + const TF_SHARED_LOCKS_REQUIRED(mu_); // Copy the node and add the (input, copy) pairs to the NodePairList. std::shared_ptr SnapshotHelper(std::shared_ptr clone_base, @@ -471,12 +495,14 @@ class Node { // Compute total buffered bytes for the node and store in the total bytes map. 
void TotalBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const; + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Compute total maximum buffered bytes for the node and store in the total // bytes map. void TotalMaximumBufferedBytesHelper( - absl::flat_hash_map* total_bytes) const; + absl::flat_hash_map* total_bytes) const + TF_SHARED_LOCKS_REQUIRED(mu_); // Stores the time passed to the last call to `Node::record_start()` on the // current thread. @@ -619,11 +645,11 @@ class Model { // an element divided by CPU budget. void OptimizeGradientDescent(int64 cpu_budget, int64 ram_budget); - // Collects the output time and if `gradient` is not `nullptr`, the output + // Collects the output time and if `gradients` is not `nullptr`, the output // time gradient w.r.t. tunable parameters of the subtree rooted in the given - // node and the last input time. + // node. double OutputTime(std::shared_ptr node, - absl::flat_hash_map* gradient); + absl::flat_hash_map* gradients); // Collects the processing time for the given node. double TotalProcessingTime(std::shared_ptr node); diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc index 898594b7c81..688dd0083e9 100644 --- a/tensorflow/core/framework/model_test.cc +++ b/tensorflow/core/framework/model_test.cc @@ -44,18 +44,19 @@ TEST_P(AsyncInterleaveManyTest, Model) { async_interleave_many->remove_input(meta_source); }); std::shared_ptr source1 = - model::MakeSourceNode({1, "source1", async_interleave_many}); + model::MakeSourceNode({2, "source1", async_interleave_many}); async_interleave_many->add_input(source1); auto cleanup1 = gtl::MakeCleanup([async_interleave_many, source1]() { async_interleave_many->remove_input(source1); }); std::shared_ptr source2 = - model::MakeSourceNode({2, "source2", async_interleave_many}); + model::MakeSourceNode({3, "source2", async_interleave_many}); async_interleave_many->add_input(source2); auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() { async_interleave_many->remove_input(source2); }); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; EXPECT_EQ(async_interleave_many->TotalBufferedBytes(), 0); EXPECT_EQ(async_interleave_many->TotalMaximumBufferedBytes(), 0); async_interleave_many->record_buffer_event(110, 10); @@ -123,7 +124,8 @@ TEST_P(AsyncKnownRatioTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", async_known_many}); async_known_many->add_input(source2); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; EXPECT_EQ(async_known_many->TotalBufferedBytes(), 0); EXPECT_EQ(async_known_many->TotalMaximumBufferedBytes(), 0); async_known_many->record_buffer_event(110, 10); @@ -194,12 +196,12 @@ TEST(InterleaveManyTest, Model) { model::MakeSourceNode({1, "meta_source", interleave_many}); interleave_many->add_input(meta_source); std::shared_ptr source1 = - model::MakeSourceNode({1, "source1", interleave_many}); + model::MakeSourceNode({2, "source1", interleave_many}); interleave_many->add_input(source1); std::shared_ptr source2 = - model::MakeSourceNode({2, "source2", interleave_many}); + model::MakeSourceNode({3, "source2", interleave_many}); interleave_many->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; interleave_many->add_processing_time(100); EXPECT_EQ(interleave_many->processing_time(), 100); 
EXPECT_EQ(interleave_many->TotalProcessingTime(/*processing_times=*/nullptr), @@ -238,7 +240,7 @@ TEST_P(KnownRatioTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", known_many}); known_many->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; source1->add_processing_time(100); EXPECT_EQ(known_many->TotalProcessingTime(/*processing_times=*/nullptr), 0); EXPECT_EQ(known_many->OutputTime(&input_times, nullptr), 0); @@ -286,7 +288,7 @@ INSTANTIATE_TEST_SUITE_P(Test, KnownRatioTest, ::testing::Values(0, 1, 2, 4)); TEST(SourceTest, Model) { std::shared_ptr source = model::MakeSourceNode({0, "source", nullptr}); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; source->add_processing_time(100); EXPECT_EQ(source->processing_time(), 100); EXPECT_EQ(source->TotalProcessingTime(/*processing_times=*/nullptr), 0); @@ -310,7 +312,7 @@ TEST(UnknownRatioTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", unknown_many}); unknown_many->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; unknown_many->add_processing_time(100); EXPECT_EQ(unknown_many->processing_time(), 100); EXPECT_EQ(unknown_many->TotalProcessingTime(/*processing_times=*/nullptr), 0); @@ -345,7 +347,7 @@ TEST(UnknownTest, Model) { std::shared_ptr source2 = model::MakeSourceNode({2, "source2", unknown}); unknown->add_input(source2); - std::vector input_times(1, 0); + absl::flat_hash_map input_times; source1->add_processing_time(100); EXPECT_EQ(unknown->TotalProcessingTime(/*processing_times=*/nullptr), 0); EXPECT_EQ(unknown->OutputTime(&input_times, nullptr), 0); @@ -390,17 +392,23 @@ class TestNode : public model::Node { return nullptr; } - double OutputTimeLocked(std::vector* input_times, - absl::flat_hash_map* gradient) - const override TF_SHARED_LOCKS_REQUIRED(mu_) { - return 0; + void InputTimeLocked(absl::flat_hash_map* input_times) + const override TF_SHARED_LOCKS_REQUIRED(mu_) {} + + void OutputTimeLocked( + const absl::flat_hash_map& input_times, + absl::flat_hash_map* gradients, + absl::flat_hash_map* output_times, + absl::flat_hash_map* output_time_gradients) const override + TF_SHARED_LOCKS_REQUIRED(mu_) { + (*output_times)[long_name()] = 0; } void TotalProcessingTimeLocked( absl::flat_hash_map* processing_times, absl::flat_hash_map* total_processing_times) override TF_SHARED_LOCKS_REQUIRED(mu_) { - total_processing_times->insert(std::make_pair(long_name(), 0)); + (*total_processing_times)[long_name()] = 0; } }; @@ -504,7 +512,7 @@ TEST(AsyncInterleaveManyGradientTest, Model) { async_interleave_many->remove_input(meta_source); }); std::shared_ptr source1 = model::MakeAsyncInterleaveManyNode( - {0, "async_interleave_many", nullptr}, + {2, "async_interleave_many", async_interleave_many}, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -514,12 +522,13 @@ TEST(AsyncInterleaveManyGradientTest, Model) { async_interleave_many->remove_input(source1); }); std::shared_ptr source2 = - model::MakeSourceNode({2, "source2", async_interleave_many}); + model::MakeSourceNode({3, "source2", async_interleave_many}); async_interleave_many->add_input(source2); auto cleanup2 = gtl::MakeCleanup([async_interleave_many, source2]() { async_interleave_many->remove_input(source2); }); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; 
async_interleave_many->CollectTunableParameters(¶meters); async_interleave_many->record_element(); @@ -532,13 +541,13 @@ TEST(AsyncInterleaveManyGradientTest, Model) { parameters[source1->long_name()]->value = 1; // Test gradient of own parameters. - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; double output_time = - async_interleave_many->OutputTime(&input_times, &gradient); + async_interleave_many->OutputTime(&input_times, &gradients); parameters[async_interleave_many->long_name()]->value += kParameterStep; double new_output_time = async_interleave_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_interleave_many->long_name()], + EXPECT_NEAR(gradients[async_interleave_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); @@ -546,7 +555,7 @@ TEST(AsyncInterleaveManyGradientTest, Model) { parameters[async_interleave_many->long_name()]->value -= kParameterStep; parameters[source1->long_name()]->value += kParameterStep; new_output_time = async_interleave_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[source1->long_name()], + EXPECT_NEAR(gradients[source1->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -565,7 +574,7 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { std::make_shared(parameter_value, nullptr, nullptr), 1, parameter_value)}); std::shared_ptr source1 = model::MakeAsyncKnownRatioNode( - {0, "source1", nullptr}, num_inputs_per_output, + {1, "source1", async_known_many}, num_inputs_per_output, {model::MakeParameter( parameter_name, std::make_shared(parameter_value, nullptr, nullptr), 1, @@ -573,7 +582,8 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { async_known_many->add_input(source1); std::shared_ptr source2 = model::MakeSourceNode({2, "source2", async_known_many}); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; async_known_many->add_input(source2); source1->record_element(); source1->add_processing_time(100); @@ -584,14 +594,14 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { // Test gradient of own parameters. 
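    // The checks below validate the analytic gradient numerically: perturb one
    // tunable parameter by kParameterStep, recompute OutputTime, and require the
    // forward-difference quotient (new_output_time - output_time) / kParameterStep
    // to match the collected gradient to within kComparisonPrecision.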
absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; async_known_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; parameters[source1->long_name()]->value = 1; - double output_time = async_known_many->OutputTime(&input_times, &gradient); + double output_time = async_known_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = async_known_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); @@ -599,7 +609,7 @@ TEST_P(AsyncKnownRatioGradientTest, Model) { parameters[async_known_many->long_name()]->value -= kParameterStep; parameters[source1->long_name()]->value += kParameterStep; new_output_time = async_known_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[source1->long_name()], + EXPECT_NEAR(gradients[source1->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -614,28 +624,29 @@ TEST(InterleaveManyGradientTest, Model) { std::shared_ptr interleave_many = model::MakeInterleaveManyNode({0, "interleave_many", nullptr}); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", interleave_many}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, parallelism)}); std::shared_ptr source1 = - model::MakeSourceNode({2, "source1", async_known_many}); + model::MakeSourceNode({2, "source1", interleave_many}); interleave_many->record_element(); interleave_many->add_processing_time(100); interleave_many->add_input(source1); interleave_many->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; interleave_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = interleave_many->OutputTime(&input_times, &gradient); + double output_time = interleave_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = interleave_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -647,7 +658,7 @@ TEST(KnownRatioGradientTest, Model) { std::shared_ptr known_many = model::MakeKnownRatioNode( {0, "known_many", nullptr}, num_inputs_per_output); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", known_many}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -657,15 +668,16 @@ TEST(KnownRatioGradientTest, Model) { known_many->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] 
= input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; known_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = known_many->OutputTime(&input_times, &gradient); + double output_time = known_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = known_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -677,7 +689,7 @@ TEST(UnknownRatioGradientTest, Model) { std::shared_ptr unknown_many = model::MakeUnknownRatioNode({0, "unknown_many", nullptr}); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", unknown_many}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -687,15 +699,16 @@ TEST(UnknownRatioGradientTest, Model) { unknown_many->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; unknown_many->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = unknown_many->OutputTime(&input_times, &gradient); + double output_time = unknown_many->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = unknown_many->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } @@ -707,7 +720,7 @@ TEST(UnknownGradientTest, Model) { std::shared_ptr unknown = model::MakeUnknownNode({0, "unknown", nullptr}); std::shared_ptr async_known_many = model::MakeAsyncKnownRatioNode( - {0, "async_known_many", nullptr}, num_inputs_per_output, + {1, "async_known_many", unknown}, num_inputs_per_output, {model::MakeParameter( "parallelism", std::make_shared(parallelism, nullptr, nullptr), 1, @@ -717,15 +730,16 @@ TEST(UnknownGradientTest, Model) { unknown->add_input(async_known_many); async_known_many->record_element(); async_known_many->add_processing_time(300); - std::vector input_times(1, input_time); + absl::flat_hash_map input_times; + input_times[kInputTimeKey] = input_time; absl::flat_hash_map> parameters; - absl::flat_hash_map gradient; + absl::flat_hash_map gradients; unknown->CollectTunableParameters(¶meters); parameters[async_known_many->long_name()]->value = 1; - double output_time = unknown->OutputTime(&input_times, &gradient); + double output_time = unknown->OutputTime(&input_times, &gradients); parameters[async_known_many->long_name()]->value += kParameterStep; double new_output_time = unknown->OutputTime(&input_times, nullptr); - EXPECT_NEAR(gradient[async_known_many->long_name()], + EXPECT_NEAR(gradients[async_known_many->long_name()], (new_output_time - output_time) / kParameterStep, kComparisonPrecision); } diff --git a/tensorflow/python/data/kernel_tests/options_test.py 
b/tensorflow/python/data/kernel_tests/options_test.py index dea217367dc..9ab3de788fc 100644 --- a/tensorflow/python/data/kernel_tests/options_test.py +++ b/tensorflow/python/data/kernel_tests/options_test.py @@ -108,7 +108,7 @@ class OptionsTest(test_base.DatasetTestBase, parameterized.TestCase): for _ in range(999): result = result.concatenate(ds) options = dataset_ops.Options() - options.experimental_optimization.autotune = False + options.experimental_optimization.autotune = True result = result.with_options(options) self.assertDatasetProduces(result, [0]*1000) From 47fbe120b00942287075c5175747f0023cc9409d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 16 May 2020 02:02:56 -0700 Subject: [PATCH 327/412] Update GraphDef version to 403. PiperOrigin-RevId: 311868879 Change-Id: I65b672d01f04083f61d3fe61f4c9778eedbd7d87 --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 8f0967c1eaa..c3a1fe1ed16 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 402 // Updated: 2020/5/15 +#define TF_GRAPH_DEF_VERSION 403 // Updated: 2020/5/16 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 0102aeaa137467d5b273c61390795558d90c4b73 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sat, 16 May 2020 02:02:57 -0700 Subject: [PATCH 328/412] compat: Update forward compatibility horizon to 2020-05-16 PiperOrigin-RevId: 311868882 Change-Id: I6909dffc42b26ede9ace7d810b234929a7275ca0 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 29ba7317747..f1c599c15c6 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 15) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 16) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From 74396bcd3096c075e24a62f34c3f5d6c0ad3c454 Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Sat, 16 May 2020 02:12:45 -0700 Subject: [PATCH 329/412] Fix windows build. 
PiperOrigin-RevId: 311869440 Change-Id: Ic2b9f5da404bb7049627271c291349c1ad1fec25 --- tensorflow/compiler/aot/tfcompile.bzl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index f0c3e7da0ba..208b01c49d5 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -208,14 +208,14 @@ def tf_library( srcs.append(debug_info) debug_info_flag = " --debug_info=$(location " + debug_info + ")" - default_fast_math_xla_flags = "XLA_FLAGS=\"\ - --xla_cpu_enable_fast_math=true \ - --xla_cpu_fast_math_honor_nans=false \ - --xla_cpu_fast_math_honor_infs=false \ - --xla_cpu_fast_math_honor_functions=false \ - --xla_cpu_fast_math_honor_division=false \ - --xla_cpu_enable_fast_min_max=true \ - $${XLA_FLAGS:-}\" " + default_fast_math_xla_flags = ("XLA_FLAGS='" + + "--xla_cpu_enable_fast_math=true " + + "--xla_cpu_fast_math_honor_nans=false " + + "--xla_cpu_fast_math_honor_infs=false " + + "--xla_cpu_fast_math_honor_functions=false " + + "--xla_cpu_fast_math_honor_division=false " + + "--xla_cpu_enable_fast_min_max=true " + + "$${XLA_FLAGS:-}' ") native.genrule( name = ("gen_" + name), From fd976b2defe66ac368b8cc5c96500bf5fe7b1d12 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Sat, 16 May 2020 02:21:00 -0700 Subject: [PATCH 330/412] optimize for int8 add. PiperOrigin-RevId: 311869888 Change-Id: I4009635592941be39aa5c71e185e3eecbc2ec49c --- .../internal/optimized/integer_ops/add.h | 141 +++++++++++------- 1 file changed, 91 insertions(+), 50 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index a9dae4feac5..ff8e4687d58 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -35,58 +35,99 @@ inline void AddElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.input2_offset, -256); TFLITE_DCHECK_LT(params.input1_offset, 256); TFLITE_DCHECK_LT(params.input2_offset, 256); + #ifdef USE_NEON - const int8x8_t output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); - const int8x8_t output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); - for (; i <= size - 8; i += 8) { - const int8x8_t input1_val_original = vld1_s8(input1_data + i); - const int8x8_t input2_val_original = vld1_s8(input2_data + i); - const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original); - const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original); - const int16x8_t input1_val = - vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset)); - const int16x8_t input2_val = - vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset)); - const int16x4_t input1_val_high = vget_high_s16(input1_val); - const int16x4_t input1_val_low = vget_low_s16(input1_val); - const int16x4_t input2_val_high = vget_high_s16(input2_val); - const int16x4_t input2_val_low = vget_low_s16(input2_val); - int32x4_t x11 = vmovl_s16(input1_val_low); - int32x4_t x12 = vmovl_s16(input1_val_high); - int32x4_t x21 = vmovl_s16(input2_val_low); - int32x4_t x22 = vmovl_s16(input2_val_high); - const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift); - x11 = vshlq_s32(x11, left_shift_dup); - x12 = vshlq_s32(x12, left_shift_dup); - x21 = vshlq_s32(x21, left_shift_dup); - x22 = vshlq_s32(x22, left_shift_dup); - x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier); - x12 
= vqrdmulhq_n_s32(x12, params.input1_multiplier); - x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier); - x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier); - const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift); - const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift); - x11 = vshlq_s32(x11, input1_shift_dup); - x12 = vshlq_s32(x12, input1_shift_dup); - x21 = vshlq_s32(x21, input2_shift_dup); - x22 = vshlq_s32(x22, input2_shift_dup); - int32x4_t s1 = vaddq_s32(x11, x21); - int32x4_t s2 = vaddq_s32(x12, x22); - s1 = vqrdmulhq_n_s32(s1, params.output_multiplier); - s2 = vqrdmulhq_n_s32(s2, params.output_multiplier); + const int8x16_t output_activation_min_vector = + vdupq_n_s8(params.quantized_activation_min); + const int8x16_t output_activation_max_vector = + vdupq_n_s8(params.quantized_activation_max); + + const int input1_left_shift = params.left_shift + params.input1_shift; + const int input2_left_shift = params.left_shift + params.input2_shift; + const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); + const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + + for (; i <= size - 16; i += 16) { + const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); + + const int16x8_t input1_val_s16_high = + vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = + vmovl_s8(vget_low_s8(input1_val_original)); + + const int16x8_t input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = + vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = + vaddq_s16(input1_val_s16_high, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_high = + vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); + const int16x8_t input1_val_low = + vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); + const int16x8_t input2_val_low = + vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + int32x4_t x111 = vmovl_s16(input1_val_low_low); + int32x4_t x112 = vmovl_s16(input1_val_low_high); + int32x4_t x121 = vmovl_s16(input1_val_high_low); + int32x4_t x122 = vmovl_s16(input1_val_high_high); + int32x4_t x211 = vmovl_s16(input2_val_low_low); + int32x4_t x212 = vmovl_s16(input2_val_low_high); + int32x4_t x221 = vmovl_s16(input2_val_high_low); + int32x4_t x222 = vmovl_s16(input2_val_high_high); + + x111 = vshlq_s32(x111, input1_left_dup); + x112 = vshlq_s32(x112, input1_left_dup); + x121 = vshlq_s32(x121, input1_left_dup); + x122 = vshlq_s32(x122, input1_left_dup); + x211 = vshlq_s32(x211, input2_left_dup); + x212 = vshlq_s32(x212, input2_left_dup); + x221 = vshlq_s32(x221, input2_left_dup); + x222 = vshlq_s32(x222, input2_left_dup); + x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier); + x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier); + x121 = vqrdmulhq_n_s32(x121, 
params.input1_multiplier); + x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier); + x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier); + x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier); + x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier); + x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier); + int32x4_t s11 = vaddq_s32(x111, x211); + int32x4_t s12 = vaddq_s32(x112, x212); + int32x4_t s21 = vaddq_s32(x121, x221); + int32x4_t s22 = vaddq_s32(x122, x222); + s11 = vqrdmulhq_n_s32(s11, params.output_multiplier); + s12 = vqrdmulhq_n_s32(s12, params.output_multiplier); + s21 = vqrdmulhq_n_s32(s21, params.output_multiplier); + s22 = vqrdmulhq_n_s32(s22, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; - s1 = RoundingDivideByPOT(s1, -params.output_shift); - s2 = RoundingDivideByPOT(s2, -params.output_shift); - const int16x4_t s1_narrowed = vmovn_s32(s1); - const int16x4_t s2_narrowed = vmovn_s32(s2); - const int16x8_t s = vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), - vdupq_n_s16(params.output_offset)); - const int8x8_t clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(s))); - vst1_s8(output_data + i, clamped); + s11 = RoundingDivideByPOT(s11, -params.output_shift); + s12 = RoundingDivideByPOT(s12, -params.output_shift); + s21 = RoundingDivideByPOT(s21, -params.output_shift); + s22 = RoundingDivideByPOT(s22, -params.output_shift); + const int16x4_t s11_narrowed = vmovn_s32(s11); + const int16x4_t s12_narrowed = vmovn_s32(s12); + const int16x4_t s21_narrowed = vmovn_s32(s21); + const int16x4_t s22_narrowed = vmovn_s32(s22); + const int16x8_t s1 = vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), + vdupq_n_s16(params.output_offset)); + const int16x8_t s2 = vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), + vdupq_n_s16(params.output_offset)); + const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2)); + + const int8x16_t clamped = + vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, s)); + vst1q_s8(output_data + i, clamped); } #endif // NEON From d70dc548b58c56a6a510b8d676cbc08ffdad3189 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Sat, 16 May 2020 03:51:08 -0700 Subject: [PATCH 331/412] Optimize trivial RealDiv ops PiperOrigin-RevId: 311874492 Change-Id: I8084b4a0a913d4585420bff20a21688ae8d41286 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 2 ++ tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 15 ++++++++++----- .../mlir/tensorflow/tests/constant-fold.mlir | 9 +++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 82282bb925a..d53bafff638 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -6331,6 +6331,8 @@ If `x` and `y` are reals, this will return the floating-point division. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; let hasCanonicalizer = 1; + + let hasFolder = 1; } def TF_ReciprocalOp : TF_Op<"Reciprocal", [NoSideEffect, SameOperandsAndResultType]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 2007824369c..78623ca3c61 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -110,7 +110,6 @@ static inline bool HasRankAtMost(Value value, int64_t rank) { return !type || type.getRank() <= rank; } - static bool IsUnknownDimOrRank(int64_t dim_or_rank) { return dim_or_rank == -1; } @@ -462,9 +461,10 @@ LogicalResult FoldOperandsPermutation( namespace { // Folder that returns LHS of an Arithmetic Op if the RHS is a constant // known to be Identity (e.g X+0) -template ::value>::type * = nullptr> +template < + typename OpT, + typename std::enable_if::value>::type * = nullptr> OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, ArrayRef operands) { auto result_op_type = arithmetic_op.getResult().getType(); @@ -479,7 +479,8 @@ OpFoldResult IdentityArithmeticOpFolder(OpT arithmetic_op, // Mul and Div ops have identity value one while AddV2 and SubOp have identity // value zero. int identity = - (std::is_same::value || std::is_same::value); + (std::is_same::value || std::is_same::value || + std::is_same::value); Type element_ty = lhs_type.getElementType(); Attribute identity_attr; @@ -2408,6 +2409,10 @@ void RealDivOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +OpFoldResult RealDivOp::fold(ArrayRef operands) { + return IdentityArithmeticOpFolder(*this, operands); +} + //===----------------------------------------------------------------------===// // ReshapeOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index bccb8923134..32815956ff7 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -384,6 +384,15 @@ func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> } +func @RemoveTrivialRealDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = constant dense<1.0> : tensor<2x2xf32> + %0 = "tf.RealDiv"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + + // CHECK-LABEL: RemoveTrivialRealDiv + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> +} + func @RemoveTrivialDivBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { %cst = constant dense<1.0> : tensor<2x2xbf16> %0 = "tf.Div"(%arg0, %cst) : (tensor<2x2xbf16>, tensor<2x2xbf16>) -> tensor<2x2xbf16> From 766f2968fcecbd815e7090aea70ad79d471a1332 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Sat, 16 May 2020 15:00:44 -0700 Subject: [PATCH 332/412] Simplify some tests PiperOrigin-RevId: 311910223 Change-Id: I751aa9344c08a490261822dc8010d1704da95a7c --- .../mlir/tensorflow/tests/constant-fold.mlir | 70 ++++++++----------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index 32815956ff7..2119e78bd1e 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -302,15 +302,13 @@ func @testTensorListElementShape(%arg0: tensor>>) -> return %0: tensor<2xi32> } -func @RemoveTrivialAdd(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialAdd(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.Add"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Add"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Add"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialAdd - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialAddBf16RHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { @@ -331,26 +329,22 @@ func @RemoveTrivialAddBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { // CHECK-NEXT: return %arg0 : tensor<2x2xbf16> } -func @RemoveTrivialAddV2(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialAddV2(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.AddV2"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialAddV2 - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } -func @RemoveTrivialSub(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialSub(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Sub"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Sub"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialSub - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialSubInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { @@ -362,26 +356,22 @@ func @RemoveTrivialSubInt8(%arg0: tensor<2x2xi8>) -> tensor<2x2xi8> { // CHECK-NEXT: return %arg0 : tensor<2x2xi8> } -func @RemoveTrivialMul(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialMul(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<1.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Mul"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Mul"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialMul - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, 
tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } -func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { +func @RemoveTrivialDiv(%arg0: tensor<2x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<1.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - %1 = "tf.Div"(%0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.Div"(%arg0, %cst) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: RemoveTrivialDiv - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - // CHECK-NEXT: return %[[RESULT]] : tensor<2x2xf32> + // CHECK-NEXT: return %arg0 : tensor<2x2xf32> } func @RemoveTrivialRealDiv(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { @@ -420,28 +410,24 @@ func @DivBf16LHS(%arg0: tensor<2x2xbf16>) -> tensor<2x2xbf16> { // CHECK: tf.Div } -func @DontRemoveTrivialAdd(%arg0: tensor<1x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<2x2xf32> { +func @DontRemoveTrivialAdd(%arg0: tensor<1x2xf32>) -> tensor<2x2xf32> { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.AddV2"(%0, %cst) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> - return %1 : tensor<2x2xf32> + %0 = "tf.AddV2"(%arg0, %cst) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> // CHECK-LABEL: DontRemoveTrivialAdd // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> - // CHECK: %[[add:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor<1x2xf32>, tensor<1x2xf32>) -> tensor<1x2xf32> - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%[[add]], %[[CONST]]) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor<1x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> // CHECK: return %[[RESULT]] : tensor<2x2xf32> } -func @DontRemoveTrivialAdd2(%arg0: tensor, %arg1: tensor<2x2xf32>) -> tensor { +func @DontRemoveTrivialAdd2(%arg0: tensor) -> tensor { %cst = constant dense<0.0> : tensor<2x2xf32> - %0 = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor<2x2xf32>) -> tensor - %1 = "tf.AddV2"(%0, %cst) : (tensor , tensor<2x2xf32>) -> tensor - return %1 :tensor + %0 = "tf.AddV2"(%arg0, %cst) : (tensor , tensor<2x2xf32>) -> tensor + return %0 :tensor // CHECK-LABEL: DontRemoveTrivialAdd2 // CHECK: %[[CONST:.*]] = constant dense<0.000000e+00> : tensor<2x2xf32> - // CHECK: %[[add:.*]] = "tf.AddV2"(%arg0, %arg1) : (tensor, tensor<2x2xf32>) -> tensor - // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%[[add]], %[[CONST]]) : (tensor, tensor<2x2xf32>) -> tensor + // CHECK: %[[RESULT:.*]] = "tf.AddV2"(%arg0, %[[CONST]]) : (tensor, tensor<2x2xf32>) -> tensor // CHECK: return %[[RESULT]] : tensor } From bf639d750bc3eb22a0ac2affb24772658641e1bd Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Sat, 16 May 2020 16:12:32 -0700 Subject: [PATCH 333/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/7af0c8559b6d PiperOrigin-RevId: 311914136 Change-Id: I8a04df09178d9dfce79c13c8d14daf4f69048dee --- tensorflow/compiler/mlir/lite/ir/tfl_ops.h | 2 +- .../compiler/mlir/tensorflow/ir/control_flow_ops.h | 2 +- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h | 2 +- .../mlir/tensorflow/transforms/constant_fold.cc | 2 +- tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h | 2 +- tensorflow/compiler/mlir/xla/ir/chlo_ops.h | 2 +- tensorflow/compiler/mlir/xla/ir/hlo_ops.h | 2 +- tensorflow/compiler/mlir/xla/ir/lhlo_ops.h | 2 +- tensorflow/compiler/xla/service/cpu/ir_emitter.cc | 5 ++--- tensorflow/compiler/xla/service/gpu/ir_emitter.cc | 2 +- third_party/mlir/BUILD | 11 ++++------- 11 files changed, 15 insertions(+), 19 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h index 0e6a3db1f1b..c7a1504c3b7 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.h +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.h @@ -27,7 +27,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/LoopLikeInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" #include "tensorflow/lite/schema/schema_generated.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h index 15a4ecfc537..39245425a5a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/control_flow_ops.h @@ -26,7 +26,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { namespace TFControlFlow { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h index 979f506b3b1..88307267ab4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h @@ -31,7 +31,7 @@ limitations under the License. 
#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/DerivedAttributeOpInterface.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index be35c6caa16..55a0b5c3fd3 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -17,7 +17,7 @@ limitations under the License. #include -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/c/eager/c_api.h" #include "tensorflow/c/tf_status.h" diff --git a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h index 545183a052b..9c98c9b0e19 100644 --- a/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h +++ b/tensorflow/compiler/mlir/tfjs/ir/tfjs_ops.h @@ -26,7 +26,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { diff --git a/tensorflow/compiler/mlir/xla/ir/chlo_ops.h b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h index 474d4b7d95a..a5337907579 100644 --- a/tensorflow/compiler/mlir/xla/ir/chlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/chlo_ops.h @@ -25,7 +25,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { namespace xla_chlo { diff --git a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h index 25b2f009cc6..9725a0684f6 100644 --- a/tensorflow/compiler/mlir/xla/ir/hlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/hlo_ops.h @@ -29,7 +29,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Interfaces/InferTypeOpInterface.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h index 190c5ff832d..1c4ccaae214 100644 --- a/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h +++ b/tensorflow/compiler/mlir/xla/ir/lhlo_ops.h @@ -27,7 +27,7 @@ limitations under the License. 
#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Interfaces/SideEffects.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project namespace mlir { class OpBuilder; diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 5a4c6250293..70dde919afb 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -2875,9 +2875,8 @@ Status IrEmitter::HandleRngGetAndUpdateState(HloInstruction* rng_state) { old_state->getType()->getScalarType(), address->getType()->getPointerAddressSpace())); llvm::StoreInst* store = Store(old_state, address); - store->setAlignment( - llvm::MaybeAlign(IrEmitter::MinimumAlignmentForPrimitiveType( - rng_state->shape().element_type()))); + store->setAlignment(llvm::Align(IrEmitter::MinimumAlignmentForPrimitiveType( + rng_state->shape().element_type()))); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 011eb07d3bd..744cd7b56bf 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -222,7 +222,7 @@ bool IrEmitter::MaybeEmitDirectAtomicOperation( // Derive a minimum alignment from the type. The optimizer can increase it // later. store->setAlignment( - llvm::MaybeAlign(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); + llvm::Align(ShapeUtil::ByteSizeOfPrimitiveType(element_type))); return true; } diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 5636bc27cff..58c932ea723 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -788,9 +788,6 @@ cc_library( "lib/Support/*.h", ], exclude = [ - # TODO(herhut): Move JitRunner out of Support so that Support does not - # depend on dialect. - "lib/Support/JitRunner.cpp", # TODO(jpienaar): Move this out, else Support depends on Analysis/ "lib/Support/MlirOptMain.cpp", ], @@ -2232,10 +2229,10 @@ gentbl( cc_library( name = "SideEffects", srcs = [ - "lib/Interfaces/SideEffects.cpp", + "lib/Interfaces/SideEffectInterfaces.cpp", ], hdrs = [ - "include/mlir/Interfaces/SideEffects.h", + "include/mlir/Interfaces/SideEffectInterfaces.h", ], includes = ["include"], deps = [ @@ -2621,8 +2618,8 @@ cc_binary( cc_library( name = "MlirJitRunner", - srcs = ["lib/Support/JitRunner.cpp"], - hdrs = ["include/mlir/Support/JitRunner.h"], + srcs = ["lib/ExecutionEngine/JitRunner.cpp"], + hdrs = ["include/mlir/ExecutionEngine/JitRunner.h"], includes = ["include"], deps = [ ":AllPassesAndDialectsNoRegistration", From 82d70b6763317e59ab84f42c095d96b676b6d4cd Mon Sep 17 00:00:00 2001 From: Anudhyan Boral Date: Sat, 16 May 2020 19:59:08 -0700 Subject: [PATCH 334/412] Add F64 Sqrt test. 
PiperOrigin-RevId: 311926087 Change-Id: I2f71e56825ad255a823c5a2fdd593231c474e6b1 --- tensorflow/compiler/xla/client/lib/math_test.cc | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index 9b8156efe5b..cb79b2ef7db 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -236,6 +236,19 @@ XLA_TEST_F(MathTest, SqrtF32) { ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } +XLA_TEST_F(MathTest, SqrtF64) { + XlaBuilder builder(TestName()); + Literal zero_literal = LiteralUtil::Zero(PrimitiveType::F64); + + std::unique_ptr zero_data = + client_->TransferToServer(zero_literal).ConsumeValueOrDie(); + + XlaOp zero = Parameter(&builder, 0, zero_literal.shape(), "zero"); + Sqrt(zero); + + ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); +} + #ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64 XLA_TEST_F(MathTest, ErfInvF64) { XlaBuilder builder(TestName()); From 93955171ee302e272ef59e286cf8c5b3060112ec Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 17 May 2020 02:02:56 -0700 Subject: [PATCH 335/412] Update GraphDef version to 404. PiperOrigin-RevId: 311945310 Change-Id: Ieda91ca5df65ea1f26085b5d8420b954815f0c7d --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index c3a1fe1ed16..63501a14f56 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 403 // Updated: 2020/5/16 +#define TF_GRAPH_DEF_VERSION 404 // Updated: 2020/5/17 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From c4a9de96742d85a8772e3868fc2f13955a195e18 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Sun, 17 May 2020 02:02:57 -0700 Subject: [PATCH 336/412] compat: Update forward compatibility horizon to 2020-05-17 PiperOrigin-RevId: 311945312 Change-Id: I16d03e29d3c39925d112516edcb48d3f4c16c0d1 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index f1c599c15c6..2a99a0774ad 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 16) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 17) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From e480d8f7ff66dbab239019c9f202748f6fa1f661 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Sun, 17 May 2020 21:07:20 -0700 Subject: [PATCH 337/412] Add a compile flag (i.e. adding "--define tflite_with_xnnpack=true" when using bazel to build, which corresponds to defining macro TFLITE_BUILD_WITH_XNNPACK_DELEGATE) to apply XNNPACK delegate in TFLite runtime. This is mainly to support Windows where weak symbols are not supported. 
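A minimal usage sketch, assuming a hypothetical float model at "model.tflite" and a placeholder build target; with the flag set, no delegate-specific application code is needed, since InterpreterBuilder injects the XNNPACK delegate for fp32 graphs and falls back to the default kernels if it cannot be applied:

    // Build with: bazel build --define tflite_with_xnnpack=true //your:target
    // ("//your:target" is a placeholder for the binary that links TFLite.)
    #include <memory>

    #include "tensorflow/lite/interpreter.h"
    #include "tensorflow/lite/kernels/register.h"
    #include "tensorflow/lite/model.h"

    std::unique_ptr<tflite::Interpreter> BuildFloatInterpreter() {
      // "model.tflite" is a placeholder path for a float (fp32) model.
      auto model = tflite::FlatBufferModel::BuildFromFile("model.tflite");
      if (!model) return nullptr;
      tflite::ops::builtin::BuiltinOpResolver resolver;
      std::unique_ptr<tflite::Interpreter> interpreter;
      // When the build flag (or TFLITE_BUILD_WITH_XNNPACK_DELEGATE) is set,
      // XNNPACK is applied here for graphs with fp32 tensors; num_threads > 1
      // also sizes the delegate's thread pool.
      tflite::InterpreterBuilder(*model, resolver)(&interpreter,
                                                   /*num_threads=*/2);
      return interpreter;
    }
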
PiperOrigin-RevId: 312011534 Change-Id: I27c6b206b8aa8ded2d2671c2fca843574f75752b --- tensorflow/lite/BUILD | 32 ++++++++++++ tensorflow/lite/interpreter_builder.cc | 26 +++------- .../lite/tflite_with_xnnpack_optional.cc | 52 +++++++++++++++++++ .../lite/tflite_with_xnnpack_optional.h | 26 ++++++++++ 4 files changed, 117 insertions(+), 19 deletions(-) create mode 100644 tensorflow/lite/tflite_with_xnnpack_optional.cc create mode 100644 tensorflow/lite/tflite_with_xnnpack_optional.h diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 14babee2da7..ef25f03562f 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -246,6 +246,7 @@ cc_library( ":minimal_logging", ":simple_memory_arena", ":string", + ":tflite_with_xnnpack_optional", ":type_to_tflitetype", ":util", ":version", @@ -311,6 +312,8 @@ cc_library( ], ) +# Link this library to inject XNNPACK delegate to TFLite runtime automatically +# by utilizing the weak symbols if they're supported by the platform. cc_library( name = "tflite_with_xnnpack", srcs = ["tflite_with_xnnpack.cc"], @@ -323,6 +326,35 @@ cc_library( alwayslink = 1, ) +# Enables applying XNNPACK delegate for float models in TFLite runtime. +# WARNING: This build flag is experimental and subject to change. +config_setting( + name = "tflite_with_xnnpack_enabled", + values = {"define": "tflite_with_xnnpack=true"}, +) + +cc_library( + name = "tflite_with_xnnpack_optional", + srcs = ["tflite_with_xnnpack_optional.cc"], + hdrs = [ + "core/macros.h", + "tflite_with_xnnpack_optional.h", + ], + copts = tflite_copts() + TFLITE_DEFAULT_COPTS, + defines = select({ + ":tflite_with_xnnpack_enabled": ["TFLITE_BUILD_WITH_XNNPACK_DELEGATE"], + "//conditions:default": [], + }), + deps = [ + "//tensorflow/lite/c:common", + ] + select({ + ":tflite_with_xnnpack_enabled": [ + "//tensorflow/lite/delegates/xnnpack:xnnpack_delegate", + ], + "//conditions:default": [], + }), +) + cc_test( name = "string_util_test", size = "small", diff --git a/tensorflow/lite/interpreter_builder.cc b/tensorflow/lite/interpreter_builder.cc index fb87702fd13..43d81ef0770 100644 --- a/tensorflow/lite/interpreter_builder.cc +++ b/tensorflow/lite/interpreter_builder.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/lite/core/api/flatbuffer_conversions.h" #include "tensorflow/lite/kernels/internal/compatibility.h" #include "tensorflow/lite/schema/schema_generated.h" +#include "tensorflow/lite/tflite_with_xnnpack_optional.h" #include "tensorflow/lite/util.h" #include "tensorflow/lite/version.h" @@ -108,27 +109,14 @@ TfLiteStatus ParseSparseIndexVector(const DimensionMetadata* src, const char* kEmptyTensorName = ""; -#if TFLITE_HAS_ATTRIBUTE_WEAK // Using weak symbols to create a delegate allows automatic injection of the // delegate simply by adding it as a dependency. - // For flex delegate, see also the strong override in // lite/delegates/flex/delegate.cc. TFLITE_ATTRIBUTE_WEAK Interpreter::TfLiteDelegatePtr AcquireFlexDelegate() { return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); } -// For XNNPACK delegate, see also the strong override in -// lite/tflite_with_xnnpack.cc. 
-TFLITE_ATTRIBUTE_WEAK Interpreter::TfLiteDelegatePtr AcquireXNNPACKDelegate( - int num_threads) { - return Interpreter::TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); -} -#else -Interpreter::TfLiteDelegatePtr (*AcquireFlexDelegate)() = nullptr; -Interpreter::TfLiteDelegatePtr (*AcquireXNNPACKDelegate)(int) = nullptr; -#endif - namespace impl { InterpreterBuilder::InterpreterBuilder(const FlatBufferModel& model, @@ -541,17 +529,17 @@ TfLiteStatus InterpreterBuilder::ParseTensors( TfLiteStatus InterpreterBuilder::ApplyDelegates(Interpreter* interpreter, int num_threads) { // First, apply XNNPACK delegate if applicable. - if (AcquireXNNPACKDelegate && num_fp32_tensors_ > 0) { - if (auto xnnpack_delegate = AcquireXNNPACKDelegate(num_threads)) { - // The execution will fall back to default implementation if the XNNPACK - // delegate fails to be applied. Therefore, we ignore the return status - // here and let it fall through the rest of the code. + if (num_fp32_tensors_ > 0) { + // The execution will fall back to default implementation if the XNNPACK + // delegate fails to be applied. Therefore, we ignore the return status + // here and let it fall through the rest of the code. + if (auto xnnpack_delegate = MaybeCreateXNNPACKDelegate(num_threads)) { interpreter->ModifyGraphWithDelegate(std::move(xnnpack_delegate)); } } // Secondly, apply Flex delegate if applicable. - if (has_flex_op_ && AcquireFlexDelegate) { + if (has_flex_op_) { if (auto flex_delegate = AcquireFlexDelegate()) { return interpreter->ModifyGraphWithDelegate(std::move(flex_delegate)); } diff --git a/tensorflow/lite/tflite_with_xnnpack_optional.cc b/tensorflow/lite/tflite_with_xnnpack_optional.cc new file mode 100644 index 00000000000..31d4ff50f28 --- /dev/null +++ b/tensorflow/lite/tflite_with_xnnpack_optional.cc @@ -0,0 +1,52 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/lite/tflite_with_xnnpack_optional.h" + +#include "tensorflow/lite/core/macros.h" + +#ifdef TFLITE_BUILD_WITH_XNNPACK_DELEGATE +#include "tensorflow/lite/delegates/xnnpack/xnnpack_delegate.h" +#endif + +namespace tflite { + +using TfLiteDelegatePtr = + std::unique_ptr; + +#ifndef TFLITE_BUILD_WITH_XNNPACK_DELEGATE +// Using weak symbols to create a delegate allows automatic injection of the +// delegate simply by adding it as a dependency. See the strong override in +// lite/tflite_with_xnnpack.cc, +TFLITE_ATTRIBUTE_WEAK TfLiteDelegatePtr +AcquireXNNPACKDelegate(int num_threads) { + return TfLiteDelegatePtr(nullptr, [](TfLiteDelegate*) {}); +} +#endif + +#ifdef TFLITE_BUILD_WITH_XNNPACK_DELEGATE +TfLiteDelegatePtr MaybeCreateXNNPACKDelegate(int num_threads) { + auto opts = TfLiteXNNPackDelegateOptionsDefault(); + // Note that we don't want to use the thread pool for num_threads == 1. + opts.num_threads = num_threads > 1 ? 
num_threads : 0; + return TfLiteDelegatePtr(TfLiteXNNPackDelegateCreate(&opts), + TfLiteXNNPackDelegateDelete); +} +#else +TfLiteDelegatePtr MaybeCreateXNNPACKDelegate(int num_threads) { + return AcquireXNNPACKDelegate(num_threads); +} +#endif + +} // namespace tflite diff --git a/tensorflow/lite/tflite_with_xnnpack_optional.h b/tensorflow/lite/tflite_with_xnnpack_optional.h new file mode 100644 index 00000000000..afbdbd17356 --- /dev/null +++ b/tensorflow/lite/tflite_with_xnnpack_optional.h @@ -0,0 +1,26 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_TFLITE_WITH_XNNPACK_OPTIONAL_H_ +#define TENSORFLOW_LITE_TFLITE_WITH_XNNPACK_OPTIONAL_H_ +#include + +#include "tensorflow/lite/c/common.h" + +namespace tflite { +std::unique_ptr +MaybeCreateXNNPACKDelegate(int num_threads); +} // namespace tflite + +#endif // TENSORFLOW_LITE_TFLITE_WITH_XNNPACK_OPTIONAL_H_ From c5fbab166f3b983c39efc997e63a11c1bd7f549e Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 18 May 2020 00:02:44 -0700 Subject: [PATCH 338/412] Fix Core ML delegate framework's include PiperOrigin-RevId: 312025787 Change-Id: I00121199e2363d307cd52c1b2bfa4cbc66d36831 --- tensorflow/lite/experimental/ios/BUILD.apple | 22 ++++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index 5c954bc3de8..a29e8bd6ed5 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -11,17 +11,6 @@ package( licenses = ["notice"], # Apache 2.0 ) -genrule( - name = "strip_coreml_include_hdr", - srcs = ["//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h"], - outs = ["coreml_delegate.h"], - cmd = """ - sed 's/#include \".*common.h"/#include \"common.h\"/' \ - "$(location //tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h)" \ - > "$@" - """, -) - TFL_FRAMEWORK_HDRS = [ "//tensorflow/lite/delegates/gpu:metal_delegate.h", "//tensorflow/lite/c:c_api.h", @@ -57,6 +46,17 @@ ios_static_framework( ], ) +genrule( + name = "strip_coreml_include_hdr", + srcs = ["//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h"], + outs = ["coreml_delegate.h"], + cmd = """ + sed "s|#include \".*common.h\"|#include \"TensorFlowLiteC/common.h\"|"\ + "$(location //tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h)"\ + > "$@" + """, +) + # This target builds the Core ML delegate as a separate static framework, which # does not include the TensorFlow Lite runtime. 
As this target does not contain # TensorFlow Lite runtime, it is intended to be linked along with the From ea4ef0e6faf651c9f76ef90848dc62d8aa660ac1 Mon Sep 17 00:00:00 2001 From: David Rim Date: Mon, 18 May 2020 00:03:24 -0700 Subject: [PATCH 339/412] Bumps llvm version PiperOrigin-RevId: 312025889 Change-Id: I9c2a75e34bbfb2b9f6afaf0398c9cfde6870ac3b --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 404d253e8bd..452152efacf 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -655,8 +655,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "9d4b4f344d8ea917e082cf58d66b71c0171e1650" - LLVM_SHA256 = "36e4470b5656cea3e0afb218edbdd96376fcb51dc2c5ed887b21237068baee41" + LLVM_COMMIT = "7af0c8559b6d9426dd5e977370516d2baa4c206f" + LLVM_SHA256 = "4c5efbc48755f9983a8522eddd6e448f0b93e3e75a56a507c1ecb44d367db6d5" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 344f8982507cd03ba79b7e21fef6f115451ee497 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 00:28:56 -0700 Subject: [PATCH 340/412] Slightly optimize quantized add. PiperOrigin-RevId: 312028385 Change-Id: Ie1fbb3071e4e258c24db78440e1275168694fda9 --- .../lite/kernels/internal/optimized/integer_ops/add.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h index ff8e4687d58..95b78b3a6b3 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/add.h @@ -47,6 +47,9 @@ inline void AddElementwise(int size, const ArithmeticParams& params, const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift); const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift); + const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset); + for (; i <= size - 16; i += 16) { const int8x16_t input1_val_original = vld1q_s8(input1_data + i); const int8x16_t input2_val_original = vld1q_s8(input2_data + i); @@ -61,13 +64,13 @@ inline void AddElementwise(int size, const ArithmeticParams& params, const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); const int16x8_t input1_val_high = - vaddq_s16(input1_val_s16_high, vdupq_n_s16(params.input1_offset)); + vaddq_s16(input1_val_s16_high, input1_offset_dup); const int16x8_t input2_val_high = - vaddq_s16(input2_val_s16_high, vdupq_n_s16(params.input2_offset)); + vaddq_s16(input2_val_s16_high, input2_offset_dup); const int16x8_t input1_val_low = - vaddq_s16(input1_val_s16_low, vdupq_n_s16(params.input1_offset)); + vaddq_s16(input1_val_s16_low, input1_offset_dup); const int16x8_t input2_val_low = - vaddq_s16(input2_val_s16_low, vdupq_n_s16(params.input2_offset)); + vaddq_s16(input2_val_s16_low, input2_offset_dup); const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); From 76853076b382474ff35f4561fde231b06a5ccdfa Mon Sep 17 00:00:00 2001 
From: David Rim Date: Mon, 18 May 2020 01:32:19 -0700 Subject: [PATCH 341/412] Add optimized MatrixBatchVectorMultiplyAccumulate for asymmetric inputs for sse PiperOrigin-RevId: 312035618 Change-Id: I5ae85ae9b0b646d2fe1e665c25aae6b99622dd2b --- .../internal/optimized/neon_tensor_utils.cc | 35 +++-- .../internal/optimized/neon_tensor_utils.h | 10 -- .../optimized/neon_tensor_utils_impl.h | 6 - .../internal/optimized/sse_tensor_utils.cc | 129 ++++++++++-------- .../internal/optimized/sse_tensor_utils.h | 22 +-- .../optimized/sse_tensor_utils_impl.h | 10 +- .../reference/portable_tensor_utils.cc | 29 ---- .../reference/portable_tensor_utils.h | 10 -- .../reference/portable_tensor_utils_impl.h | 6 - .../kernels/internal/tensor_utils_test.cc | 8 +- 10 files changed, 110 insertions(+), 155 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 4c90cd86a56..c96f298370a 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -1466,16 +1466,20 @@ void NeonMatrixBatchVectorMultiplyAccumulate( int i = 0; int32_t* scratch_ptr = scratch; for (; i <= total_size - 8; i += 8, result += 8) { - float batch_scaling_factor0 = scaling_factors[i / m_rows]; - float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; - if (per_channel_scale) { - batch_scaling_factor0 *= per_channel_scale[i % m_rows]; - batch_scaling_factor1 *= per_channel_scale[(i + 4) % m_rows]; - } + const float batch_scaling_factor0 = scaling_factors[i / m_rows]; + const float batch_scaling_factor1 = scaling_factors[(i + 4) / m_rows]; const int batch_input_offset0 = -input_offset[i / m_rows]; const int batch_input_offset1 = -input_offset[(i + 4) / m_rows]; - const float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); - const float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + float32x4_t scaling_factor0 = vdupq_n_f32(batch_scaling_factor0); + float32x4_t scaling_factor1 = vdupq_n_f32(batch_scaling_factor1); + if (per_channel_scale) { + const float32x4_t per_channel_scale0 = + vld1q_f32(&per_channel_scale[i % m_rows]); + const float32x4_t per_channel_scale1 = + vld1q_f32(&per_channel_scale[(i + 4) % m_rows]); + scaling_factor0 = vmulq_f32(scaling_factor0, per_channel_scale0); + scaling_factor1 = vmulq_f32(scaling_factor1, per_channel_scale1); + } const int32x4_t input_offset0 = vdupq_n_s32(batch_input_offset0); const int32x4_t input_offset1 = vdupq_n_s32(batch_input_offset1); const int32x4_t row_sum0 = vld1q_s32(row_sums + (i % m_rows)); @@ -1498,7 +1502,10 @@ void NeonMatrixBatchVectorMultiplyAccumulate( scratch_ptr += i; for (; i < total_size; i++) { - const float batch_scaling_factor = scaling_factors[i / m_rows]; + float batch_scaling_factor = scaling_factors[i / m_rows]; + if (per_channel_scale) { + batch_scaling_factor *= per_channel_scale[i % m_rows]; + } const int32_t zero_point = input_offset[i / m_rows]; int32_t dotprod = *(scratch_ptr++); dotprod -= row_sums[i % m_rows] * zero_point; @@ -1514,16 +1521,6 @@ void NeonMatrixBatchVectorMultiplyAccumulate( per_channel_scale, input_offset, row_sums); } -void NeonMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - 
NeonMatrixBatchVectorMultiplyAccumulateImpl( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - per_channel_scale, input_offset, nullptr); -} - inline int64x2x2_t MulAdd(int32x4_t acc, int32x4_t lhs, int32x4_t rhs) { int64x2x2_t result; const int64x2_t lhs_low = vmovl_s32(vget_low_s32(lhs)); diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index b978bf5f3bb..86951fcd559 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -55,16 +55,6 @@ void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, vectors, scaling_factors, n_batch, scratch, result, context); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, - vectors, scaling_factors, n_batch, result, per_channel_scale, - input_offset); -} - void MatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h index 1b043390c22..1554d07a61c 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h @@ -62,12 +62,6 @@ void NeonMatrixBatchVectorMultiplyAccumulate( const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, CpuBackendContext* context); -void NeonMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset); - void NeonApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, const int32_t* bias, int32_t layer_norm_scale_a, int32_t layer_norm_scale_b, int32_t variance_limit, diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc index 7fb69e7b4f4..80cc14c6d26 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include +#include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/internal/compatibility.h" namespace tflite { @@ -89,18 +90,24 @@ float GetFloatVectorElement(__m128 v) { } // namespace -void SseMatrixBatchVectorMultiplyAccumulate( +void SseMatrixBatchVectorMultiplyAccumulateImpl( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result) { + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, const int32_t* row_sums) { for (std::intptr_t batch = 0; batch < n_batch; ++batch) { const float batch_scaling_factor = scaling_factors[batch]; + const int32_t batch_offset = input_offset ? input_offset[batch] : 0; // Compute dot-product for every column. for (std::intptr_t row = 0; row < m_rows; ++row) { // Get the address of the first element of the row. const int8_t* __restrict__ row_ptr = matrix + row * m_cols; - + const float row_scale = + per_channel_scale ? per_channel_scale[row] * batch_scaling_factor + : batch_scaling_factor; + const int32_t row_offset = + row_sums && batch_offset ? batch_offset * row_sums[row] : 0; // Initialize the dot product sum for the row to 0. __m128i dotprod_32x4 = _mm_setzero_si128(); std::intptr_t col = 0; @@ -152,8 +159,10 @@ void SseMatrixBatchVectorMultiplyAccumulate( for (; col < m_cols; ++col) { sum += row_ptr[col] * vectors[col]; } // for col - - *result += sum * batch_scaling_factor; + if (row_offset) { + sum -= row_offset; + } + *result += sum * row_scale; ++result; } // for row @@ -165,56 +174,30 @@ void SseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset) { - if (input_offset == nullptr) { - SseMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, - scaling_factors, n_batch, result); - return; - } - static constexpr std::intptr_t kBlockSize = 16; - for (std::intptr_t batch = 0; batch < n_batch; ++batch) { - const float batch_scaling_factor = scaling_factors[batch]; - for (std::intptr_t row = 0; row < m_rows; ++row) { - const int8_t* __restrict__ row_ptr = matrix + row * m_cols; - float scale = batch_scaling_factor; - if (per_channel_scale != nullptr) { - scale *= per_channel_scale[row]; - } - __m128i dotprod_32x4 = _mm_setzero_si128(); - __m128i row_sum_16x8 = _mm_setzero_si128(); - std::intptr_t col = 0; - for (; col < (m_cols & ~(kBlockSize - 1)); col += kBlockSize) { - const __m128i vec_8x16 = - _mm_loadu_si128(reinterpret_cast(vectors + col)); - const __m128i row_8x16 = - _mm_loadu_si128(reinterpret_cast(row_ptr + col)); - // dotprod += vec · row - dotprod_32x4 = - _mm_add_epi32(dotprod_32x4, DotProdInt8x4x4(vec_8x16, row_8x16)); + float* __restrict__ result) { + SseMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + /*per_channel_scale=*/nullptr, /*input_offset=*/nullptr, + /*row_sums=*/nullptr); +} - // Pairwise add 16x 8-bit values; equivalently, multipy-add with 1. - // Result is 8x 16-bit values. 
- const __m128i row_16x8 = _mm_maddubs_epi16(_mm_set1_epi8(1), row_8x16); - row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); - } // for col - // Pairwise add 8x 16-bit values; equivalently, multipy-add with 1. - // Result is 4x 32-bit values. - const __m128i row_sum_32x4 = - _mm_madd_epi16(row_sum_16x8, _mm_set1_epi16(1)); - int32_t sum = ReduceInt32x4(dotprod_32x4); - int32_t row_sum = ReduceInt32x4(row_sum_32x4); - // Postamble loop. - for (; col < m_cols; ++col) { - sum += row_ptr[col] * vectors[col]; - row_sum += row_ptr[col]; - } // for col - sum -= row_sum * input_offset[batch]; - *result += sum * scale; - ++result; - } // for row - vectors += m_cols; - } // for batch +void SseMatrixBatchVectorMultiplyAccumulate( + const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, + const int8_t* __restrict__ vectors, + const float* __restrict__ scaling_factors, int n_batch, + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context) { + if ((input_offset != nullptr) && (!compute_row_sums || *compute_row_sums)) { + memset(row_sums, 0, sizeof(int32_t) * m_rows); + SseReductionSumVector(matrix, row_sums, m_rows, m_cols); + if (compute_row_sums) { + *compute_row_sums = false; + } + } + SseMatrixBatchVectorMultiplyAccumulateImpl( + matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, + per_channel_scale, input_offset, row_sums); } namespace { @@ -347,6 +330,44 @@ void SseSparseMatrixBatchVectorMultiplyAccumulate( } // for batch } +void SseReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + const int output_size, const int reduction_size) { + static constexpr std::intptr_t kBlockSize = 16; + for (std::intptr_t row = 0; row < output_size; ++row) { + const int8_t* __restrict__ row_ptr = input_vector + row * reduction_size; + __m128i row_sum_16x8 = _mm_setzero_si128(); + std::intptr_t col = 0; + for (; col < (reduction_size & ~(kBlockSize - 1)); col += kBlockSize) { + const __m128i row_8x16 = + _mm_loadu_si128(reinterpret_cast(row_ptr + col)); + const __m128i row_16x8 = _mm_maddubs_epi16(_mm_set1_epi8(1), row_8x16); + row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); + } // for col +#ifdef __SSE4_1__ + // Postamble for 8x 8-bit inputs. + if (col < (reduction_size & ~7)) { + // _mm_loadu_si64 not supported in gcc versions < 9, breaks kokoro build. + const __m128i row_16x8 = _mm_cvtepi8_epi16( + _mm_loadl_epi64(reinterpret_cast(row_ptr + col))); + // dotprod += vec · row + row_sum_16x8 = _mm_add_epi16(row_sum_16x8, row_16x8); + col += 8; + } +#endif + const __m128i row_sum_32x4 = + _mm_madd_epi16(row_sum_16x8, _mm_set1_epi16(1)); + int32_t row_sum = ReduceInt32x4(row_sum_32x4); +#if defined(__SSE4_1__) && defined(__clang__) + // SSE 4.1: Don't try to unroll and vectorize this, already done above. 
+#pragma clang loop unroll(disable) vectorize(disable) +#endif + for (; col < reduction_size; col++) { + row_sum += *(row_ptr + col); + } + *(output_vector + row) += row_sum; + } +} + } // namespace tensor_utils } // namespace tflite diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index 986e70a7823..224d811e862 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -59,10 +59,9 @@ void MatrixBatchVectorMultiplyAccumulate( int n_batch, float* __restrict__ result, const float* per_channel_scale, const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, bool* compute_row_sums, CpuBackendContext* context) { - PortableMatrixBatchVectorMultiplyAccumulate( - matrix, m_rows, m_cols, vectors, scaling_factors, n_batch, result, - per_channel_scale, input_offset, scratch, row_sums, compute_row_sums, - context); + SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, + vectors, scaling_factors, n_batch, result, per_channel_scale, + input_offset, scratch, row_sums, compute_row_sums, context); } void MatrixBatchVectorMultiplyAccumulate( @@ -75,17 +74,6 @@ void MatrixBatchVectorMultiplyAccumulate( vectors, scaling_factors, n_batch, result); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, - const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset) { - SSE_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, - vectors, scaling_factors, n_batch, result, per_channel_scale, - input_offset); -} - void SparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, @@ -315,8 +303,8 @@ void ReductionSumVector(const int32_t* input_vector, int32_t* output_vector, void ReductionSumVector(const int8_t* input_vector, int32_t* output_vector, int output_size, int reduction_size) { - NEON_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, - reduction_size); + SSE_OR_PORTABLE(ReductionSumVector, input_vector, output_vector, output_size, + reduction_size); } void MeanStddevNormalization(const float* input_vector, float* output_vector, diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h index 1996b1f30a9..c5ede624762 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils_impl.h @@ -17,6 +17,8 @@ limitations under the License. 
#include +#include "tensorflow/lite/kernels/cpu_backend_context.h" + #if defined(_MSC_VER) #define __restrict__ __restrict #endif @@ -38,8 +40,9 @@ void SseMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* __restrict__ scaling_factors, int n_batch, - float* __restrict__ result, const float* __restrict__ per_channel_scale, - const int32_t* __restrict__ input_offset); + float* __restrict__ result, const float* per_channel_scale, + const int32_t* input_offset, int32_t* scratch, int32_t* row_sums, + bool* compute_row_sums, CpuBackendContext* context); // Matrix multiplication for quantized values using symmetric quantization. // Sparse version. @@ -49,6 +52,9 @@ void SseSparseMatrixBatchVectorMultiplyAccumulate( const float* __restrict__ scaling_factors, int n_batch, float* __restrict__ result); +void SseReductionSumVector(const int8_t* input_vector, int32_t* output_vector, + const int output_size, const int reduction_size); + #endif // __SSSE3__ } // namespace tensor_utils diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 0e66dfee191..4f6db290d4f 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -161,35 +161,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate( } // for batch } -void PortableMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - for (int batch = 0; batch < n_batch; ++batch, vectors += m_cols) { - const float batch_scaling_factor = scaling_factors[batch]; - const float batch_offset = input_offset[batch]; - const int8_t* row_ptr = matrix; - for (int row = 0; row < m_rows; ++row) { - int32_t dotprod = 0; - float scale = batch_scaling_factor; - if (per_channel_scale) { - scale *= per_channel_scale[row]; - } -#if defined(__GNUC__) - // Prefetch the row to cache. 
- __builtin_prefetch(row_ptr, 0 /* prefetch for read */, - 3 /* temporal locality */); -#endif - for (int col = 0; col < m_cols; ++col, ++row_ptr) { - dotprod += (*row_ptr) * (vectors[col] - batch_offset); - } // for col - *result += dotprod * scale; - ++result; - } // for row - } // for batch -} - void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, const int8_t* __restrict__ vectors, const float* scaling_factors, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index f2e6c9b4f7d..0fd7a407595 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -98,16 +98,6 @@ void MatrixBatchVectorMultiplyAccumulate(const int8_t* __restrict__ matrix, scaling_factors, n_batch, result); } -void MatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset) { - PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors, - scaling_factors, n_batch, result, - per_channel_scale, input_offset); -} - void SparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 6c15a6cd919..34767ccd942 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -83,12 +83,6 @@ void PortableMatrixBatchVectorMultiplyAccumulate( int n_batch, int32_t* scratch, float* __restrict__ result, CpuBackendContext* context); -void PortableMatrixBatchVectorMultiplyAccumulate( - const int8_t* __restrict__ matrix, const int m_rows, const int m_cols, - const int8_t* __restrict__ vectors, const float* scaling_factors, - int n_batch, float* __restrict__ result, const float* per_channel_scale, - const int32_t* input_offset); - void PortableSparseMatrixBatchVectorMultiplyAccumulate1x4( const float* __restrict__ matrix, const int32_t* __restrict__ segments, const int32_t* __restrict__ indices, int m_rows, int m_cols, diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index 3ad59acdb68..878cf0d2618 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -1136,11 +1136,15 @@ std::vector TestPerChannelDotprodMatrixBatchVectorMultiply( bool is_per_channel = true) { MatrixVectorData data = SetupMatrixVectorData(rows, cols, batch, negative, is_per_channel); - + std::vector scratch(rows * batch); + std::vector row_sums(rows); + bool compute_row_sums = true; + CpuBackendContext context; MatrixBatchVectorMultiplyAccumulate( data.matrix.data(), rows, cols, data.vectors.data(), data.scale_factors.data(), batch, &data.results[0], - data.per_channel_scales.data(), data.input_offsets.data()); + data.per_channel_scales.data(), data.input_offsets.data(), scratch.data(), + row_sums.data(), &compute_row_sums, &context); return data.results; 
} From de8a517f4068589fb5cd82c8a8a8dc3d5e101c0e Mon Sep 17 00:00:00 2001 From: Taehee Jeong Date: Mon, 18 May 2020 01:58:56 -0700 Subject: [PATCH 342/412] fix escape in Core ML header processing PiperOrigin-RevId: 312038605 Change-Id: I422e343729a7f27808c3f9b908460faeeaa58ce5 --- tensorflow/lite/experimental/ios/BUILD.apple | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/ios/BUILD.apple b/tensorflow/lite/experimental/ios/BUILD.apple index a29e8bd6ed5..7e2a3623af1 100644 --- a/tensorflow/lite/experimental/ios/BUILD.apple +++ b/tensorflow/lite/experimental/ios/BUILD.apple @@ -51,7 +51,7 @@ genrule( srcs = ["//tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h"], outs = ["coreml_delegate.h"], cmd = """ - sed "s|#include \".*common.h\"|#include \"TensorFlowLiteC/common.h\"|"\ + sed 's|#include ".*common.h"|#include "TensorFlowLiteC/common.h"|'\ "$(location //tensorflow/lite/experimental/delegates/coreml:coreml_delegate.h)"\ > "$@" """, From 647ef2db28957b9cb1d0df66ee9a2a37ca21ca15 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 02:02:53 -0700 Subject: [PATCH 343/412] Update GraphDef version to 405. PiperOrigin-RevId: 312039077 Change-Id: I03ac966118084eb80d817cdfe98b175c75bf86aa --- tensorflow/core/public/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h index 63501a14f56..7abbcd5474c 100644 --- a/tensorflow/core/public/version.h +++ b/tensorflow/core/public/version.h @@ -108,7 +108,7 @@ limitations under the License. #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0 -#define TF_GRAPH_DEF_VERSION 404 // Updated: 2020/5/17 +#define TF_GRAPH_DEF_VERSION 405 // Updated: 2020/5/18 // Checkpoint compatibility versions (the versions field in SavedSliceMeta). // From 72c50430aa5347e6c9bc1a1927a4e13db0dc766a Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 02:02:54 -0700 Subject: [PATCH 344/412] compat: Update forward compatibility horizon to 2020-05-18 PiperOrigin-RevId: 312039082 Change-Id: I03c04d8d9a395087e866a67ca58a263150b3f754 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 2a99a0774ad..88a26661f82 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 17) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 5, 18) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From b2f3e8f5639a9370c9f8987a733ab3496eb87a97 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 18 May 2020 06:16:05 -0700 Subject: [PATCH 345/412] numerics_test.py: Move tfdbg2-specific test methods to debug_v2_ops_test.py PiperOrigin-RevId: 312065934 Change-Id: Idf576fd41ae96ed19f815bcce8848eabef036834 --- .../python/debug/lib/debug_v2_ops_test.py | 34 ++++++++++++++ .../python/kernel_tests/numerics_test.py | 46 ------------------- 2 files changed, 34 insertions(+), 46 deletions(-) diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index c76cbeeac6c..07721920f63 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -33,6 +33,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_debug_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import googletest @@ -680,6 +681,39 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): self.assertAllEqual(tensor_1, tensor_2) self.assertEqual(tensor_id_1, tensor_id_2) + def testCheckNumericsV2OpNegativeAndPositiveInf(self): + """Test that CheckNumericsV2 op distinguishes negative and positive infs.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([-1.0, 1.0]) + t2 = constant_op.constant([0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had -Inf and \+Inf values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + + def testCheckNumericsV2OpNegativeAndPositiveInfAndNaN(self): + """CheckNumericsV2 op distinguishes - & + infs when nan is present.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([-1.0, 1.0, 0.0]) + t2 = constant_op.constant([0.0, 0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had -Inf, \+Inf, and NaN values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + + def testCheckNumericsV2PositiveInfAndNaN(self): + """Test that CheckNumericsV2 op shows sign of inf when nan is present.""" + with self.session(graph=ops.Graph()): + t1 = constant_op.constant([0.0, 1.0]) + t2 = constant_op.constant([0.0, 0.0]) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r"pass through test.*had \+Inf and NaN values"): + self.evaluate( + array_ops.check_numerics_v2(t1 / t2, message="pass through test")) + if __name__ == "__main__": ops.enable_eager_execution() diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py index 4d31cd45289..950658bc886 100644 --- a/tensorflow/python/kernel_tests/numerics_test.py +++ b/tensorflow/python/kernel_tests/numerics_test.py @@ -24,7 +24,6 @@ import numpy as np from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes -from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import 
test_util from tensorflow.python.ops import array_ops @@ -132,51 +131,6 @@ class NumericsTest(test.TestCase): r"or `tf.while_loop\(\)`\."): numerics.add_check_numerics_ops() - def testCheckNumericsV2OpNegativeAndPositiveInf(self): - """Test that CheckNumericsV2 op distinguishes negative and positive infs.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([-1.0, 1.0]) - t2 = constant_op.constant([0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had -Inf and +Inf values", caught.message) - self.assertIn("pass through test", caught.message) - - def testCheckNumericsV2OpNegativeAndPositiveInfAndNaN(self): - """CheckNumericsV2 op distinguishes - & + infs when nan is present.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([-1.0, 1.0, 0.0]) - t2 = constant_op.constant([0.0, 0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had -Inf, +Inf, and NaN values", caught.message) - self.assertIn("pass through test", caught.message) - - def testCheckNumericsV2PositiveInfAndNaN(self): - """Test that CheckNumericsV2 op shows sign of inf when nan is present.""" - with self.session(graph=ops.Graph()): - t1 = constant_op.constant([0.0, 1.0]) - t2 = constant_op.constant([0.0, 0.0]) - checked = array_ops.check_numerics_v2( - t1 / t2, message="pass through test") - caught = None - try: - self.evaluate(checked) - except errors.InvalidArgumentError as error: - caught = error - self.assertIn("had +Inf and NaN values", caught.message) - self.assertIn("pass through test", caught.message) - if __name__ == "__main__": # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems From fb416f16e2b01252326816bb311c3e6165d13bcf Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Mon, 18 May 2020 06:28:20 -0700 Subject: [PATCH 346/412] [tfdbg] Fix source_utils_test in Python 3.8+ This is related to https://bugs.python.org/issue12458 In python 3.8, traceback reports the first instead of last line in a multi-line continuation block. Certain parts of source_utils_test.py assume that traceback always returns the last line, which is true all the way up to 3.7. In order to fix this, we use the `ast` module to extract the lineno of the first line in a multi-line continuation block. 
PiperOrigin-RevId: 312067389 Change-Id: I8a3ac129b3d75230a3eedd64c3605779dcab5336 --- tensorflow/python/debug/BUILD | 1 - .../python/debug/lib/source_utils_test.py | 38 ++++++++++++++++++- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 956e90999c7..1ef0504ecb8 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -840,7 +840,6 @@ py_test( python_version = "PY3", srcs_version = "PY2AND3", tags = [ - "no_oss_py38", #TODO(b/151449908) "no_windows", ], deps = [ diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py index faf2365fc9c..89964a21ba7 100644 --- a/tensorflow/python/debug/lib/source_utils_test.py +++ b/tensorflow/python/debug/lib/source_utils_test.py @@ -18,7 +18,9 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import ast import os +import sys import tempfile import zipfile @@ -43,7 +45,41 @@ from tensorflow.python.util import tf_inspect def line_number_above(): - return tf_inspect.stack()[1][2] - 1 + """Get lineno of the AST node immediately above this function's call site. + + It is assumed that there is no empty line(s) between the call site and the + preceding AST node. + + Returns: + The lineno of the preceding AST node, at the same level of the AST. + If the preceding AST spans multiple lines: + - In Python 3.8+, the lineno of the first line is returned. + - In older Python versions, the lineno of the last line is returned. + """ + # https://bugs.python.org/issue12458: In Python 3.8, traceback started + # to return the lineno of the first line of a multi-line continuation block, + # instead of that of the last line. Therefore, in Python 3.8+, we use `ast` to + # get the lineno of the first line. + call_site_lineno = tf_inspect.stack()[1][2] + if sys.version_info < (3, 8): + return call_site_lineno - 1 + else: + with open(__file__, "rb") as f: + source_text = f.read().decode("utf-8") + source_tree = ast.parse(source_text) + prev_node = _find_preceding_ast_node(source_tree, call_site_lineno) + return prev_node.lineno + + +def _find_preceding_ast_node(node, lineno): + """Find the ast node immediately before and not including lineno.""" + for i, child_node in enumerate(node.body): + if child_node.lineno == lineno: + return node.body[i - 1] + if hasattr(child_node, "body"): + found_node = _find_preceding_ast_node(child_node, lineno) + if found_node: + return found_node class GuessIsTensorFlowLibraryTest(test_util.TensorFlowTestCase): From ff2019a216aed7bbb1e30432b47abcfe5567f0b4 Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 07:06:15 -0700 Subject: [PATCH 347/412] Optimize multiply by quantize multiplier. PiperOrigin-RevId: 312072311 Change-Id: I7d01be9aa8f1a238c6887d4770a1090899337383 --- .../internal/optimized/optimized_ops.h | 82 ++++++------------- 1 file changed, 27 insertions(+), 55 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index b18f0f4bb5a..64598d70ee3 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -201,63 +201,35 @@ MatrixMap MapAsMatrixWithGivenNumberOfRows(Scalar* data, // MultiplyByQuantizedMultipler. 
#ifdef USE_NEON inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows( - int32x4x4_t input_val, int32 quantized_multiplier, int shift) { - using gemmlowp::RoundingDivideByPOT; - using gemmlowp::SaturatingRoundingDoublingHighMul; - const int left_shift = shift > 0 ? shift : 0; - const int right_shift = shift > 0 ? 0 : -shift; + int32x4x4_t input_val, int32 quantized_multiplier, int32 shift) { + const int left_shift = std::max(shift, 0); + const int right_shift = std::min(shift, 0); int32x4x4_t result; - // The vector type support for SaturatingRoundingDoublingHighMulth in gemmlowp - // is limited to NEON. -#ifdef GEMMLOWP_NEON - const int32x4_t left_shifted_one_dup = vdupq_n_s32(1 << left_shift); - result.val[0] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[0], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[1] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[1], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[2] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[2], left_shifted_one_dup), - quantized_multiplier), - right_shift); - result.val[3] = - RoundingDivideByPOT(SaturatingRoundingDoublingHighMul( - vmulq_s32(input_val.val[3], left_shifted_one_dup), - quantized_multiplier), - right_shift); -#else - for (int i = 0; i < 4; ++i) { - int32_t vals[4]; - vals[0] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 0) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[1] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 1) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[2] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 2) * (1 << left_shift), - quantized_multiplier), - right_shift); - vals[3] = RoundingDivideByPOT( - SaturatingRoundingDoublingHighMul( - vgetq_lane_s32(input_val.val[i], 3) * (1 << left_shift), - quantized_multiplier), - right_shift); - result.val[i] = vld1q_s32(reinterpret_cast(&vals)); - } -#endif + int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier); + int32x4_t left_shift_dup = vdupq_n_s32(left_shift); + int32x4_t right_shift_dup = vdupq_n_s32(right_shift); + + result.val[0] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[1] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[2] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), + multiplier_dup), + right_shift_dup); + + result.val[3] = + vrshlq_s32(vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), + multiplier_dup), + right_shift_dup); + return result; } #endif From b5ed51fb220fa85b96268b392fe7f60804c004c3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 07:37:15 -0700 Subject: [PATCH 348/412] Resolve trivial aliases for portable TensorFlow targets. 
PiperOrigin-RevId: 312076343 Change-Id: I49adacfaea505bed1edb4ca51776057474d2a4ca --- tensorflow/tensorflow.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 9e89094f4e7..d72bdf58186 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -874,7 +874,7 @@ def tf_gen_op_wrappers_cc( clean_dep("//tensorflow/core:ops"), clean_dep("//tensorflow/core:protos_all_cc"), ]) + if_android([ - clean_dep("//tensorflow/core:android_tensorflow_lib"), + clean_dep("//tensorflow/core:portable_tensorflow_lib"), ]), copts = tf_copts(), alwayslink = 1, @@ -891,7 +891,7 @@ def tf_gen_op_wrappers_cc( clean_dep("//tensorflow/core:ops"), clean_dep("//tensorflow/core:protos_all_cc"), ]) + if_android([ - clean_dep("//tensorflow/core:android_tensorflow_lib"), + clean_dep("//tensorflow/core:portable_tensorflow_lib"), ]), copts = tf_copts(), alwayslink = 1, From ea113ef6cdbd34203f8f951af8621dbc1e4572e6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 07:41:37 -0700 Subject: [PATCH 349/412] Integrate LLVM at https://github.com/llvm/llvm-project/commit/a2a4e5aae894 PiperOrigin-RevId: 312076934 Change-Id: I12015eb4ec1278668834ca8a687d290a00eba112 --- tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index c2b11819448..6375bf7341f 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -292,7 +292,7 @@ llvm::AllocaInst* EmitAllocaAtFunctionEntryWithCount(llvm::Type* type, llvm::AllocaInst* alloca = b->CreateAlloca(type, element_count, AsStringRef(name)); if (alignment != 0) { - alloca->setAlignment(llvm::MaybeAlign(alignment)); + alloca->setAlignment(llvm::Align(alignment)); } return alloca; } From f40a063d84df3f4e0ed2a2fc78d8b79f203a03b4 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 07:46:08 -0700 Subject: [PATCH 350/412] [TF:TRT] Enhance InstantiateBuildAndRun to support the case where the input type and output type are not the same. This is to prepare for a change to enhance the TF-TRT bridge to support the Cast operations that can be represented via IIdentityLayer. PiperOrigin-RevId: 312077452 Change-Id: Iab6bfb54d6a346eef158785f61a1311559cee855 --- .../tf2tensorrt/convert/convert_nodes_test.cc | 37 +++++++++++++++---- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 884ed7a5771..82c02c17e93 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -1712,7 +1712,7 @@ INSTANTIATE_TEST_CASE_P( // Builds and runs the converted network. Checks output tensor shape. Tests // output values using a matcher. -template +template void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, const TestParamBase& p, const std::vector& input_vec, @@ -1731,12 +1731,14 @@ void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, // runtime errors. 
return; } - typedef typename EnumToDataType::Type T; + typedef typename EnumToDataType::Type Tin; TensorShape shape; TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.input_dims, &shape)); const DataVec input_data{ - {"input", test->AsTensor(CastTestVector(input_vec), shape)}}; - DataVec output_data{{name, test->ConstructTensor(6)}}; + {"input", + test->AsTensor(CastTestVector(input_vec), shape)}}; + typedef typename EnumToDataType::Type Tout; + DataVec output_data{{name, test->ConstructTensor(6)}}; test->BuildAndRun(input_data, &output_data); // Check the shape of the actual output tensor TF_EXPECT_OK(TensorShapeUtils::MakeShape(p.expected_output_dims, &shape)); @@ -1744,7 +1746,7 @@ void BuildAndRunConvertedNetwork(const string& name, OpConverterTest* test, << "Expected shape: " << shape.DebugString() << ", actual shape" << output_data[0].tensor.shape().DebugString(); // Cast the output to float and compare to expected output - auto out_span = GetSpanForData(output_data[0]); + auto out_span = GetSpanForData(output_data[0]); std::vector casted_output(out_span.begin(), out_span.end()); EXPECT_THAT(casted_output, matcher); } @@ -1754,16 +1756,35 @@ void InstantiateBuildAndRun(DataType tf_dtype, const string& name, const std::vector& input_vec, const Matcher>& matcher) { if (tf_dtype == DT_FLOAT) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else if (tf_dtype == DT_HALF) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else if (tf_dtype == DT_INT32) { - BuildAndRunConvertedNetwork(name, test, p, input_vec, matcher); + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); } else { FAIL() << "Test not supported for " << tf_dtype; } } +void InstantiateBuildAndRun(DataType input_tf_dtype, DataType output_tf_dtype, + const string& name, OpConverterTest* test, + const TestParamBase& p, + const std::vector& input_vec, + const Matcher>& matcher) { + if (input_tf_dtype == output_tf_dtype) { + InstantiateBuildAndRun(input_tf_dtype, name, test, p, input_vec, matcher); + } else if (input_tf_dtype == DT_HALF && output_tf_dtype) { + BuildAndRunConvertedNetwork(name, test, p, input_vec, + matcher); + } else { + FAIL() << "Test not supported for input " << input_tf_dtype << " output " + << output_tf_dtype; + } +} + template void CopyTensorElements(const Tensor& tensor, protobuf::RepeatedField* out) { out->Clear(); From 50fcac47a2652459a7f9b71255cfa1cf0077447b Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Mon, 18 May 2020 07:49:05 -0700 Subject: [PATCH 351/412] Optimize quantized mul. 
PiperOrigin-RevId: 312077803 Change-Id: Ib6bbf261834a828590748e2c39ad146bad7d80ae --- .../internal/optimized/integer_ops/mul.h | 139 ++++++++++++------ 1 file changed, 97 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h index 18aeef4c8b5..0d385ec1656 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/mul.h @@ -38,49 +38,81 @@ inline void MulElementwise(int size, const ArithmeticParams& params, TFLITE_DCHECK_GT(params.output_offset, -256); TFLITE_DCHECK_LT(params.output_offset, 256); #ifdef USE_NEON - const auto input1_offset_vector = vdupq_n_s16(params.input1_offset); - const auto input2_offset_vector = vdupq_n_s16(params.input2_offset); - const auto output_offset_vector = vdupq_n_s16(params.output_offset); + const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset); + const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset); + const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset); const auto output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); + vdupq_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); + vdupq_n_s8(params.quantized_activation_max); const int left_shift = std::max(0, params.output_shift); const int right_shift = std::max(0, -params.output_shift); const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); - for (; i <= size - 8; i += 8) { - // We load / store 8 at a time, multiplying as two sets of 4 int32s. - const auto input1_val_original = vld1_s8(input1_data + i); - const auto input2_val_original = vld1_s8(input2_data + i); - const auto input1_val_s16 = vmovl_s8(input1_val_original); - const auto input2_val_s16 = vmovl_s8(input2_val_original); - const auto input1_val = vaddq_s16(input1_val_s16, input1_offset_vector); - const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); + for (; i <= size - 16; i += 16) { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. 
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i); + const int8x16_t input2_val_original = vld1q_s8(input2_data + i); - const auto input1_val_low = vget_low_s16(input1_val); - const auto input1_val_high = vget_high_s16(input1_val); - const auto input2_val_low = vget_low_s16(input2_val); - const auto input2_val_high = vget_high_s16(input2_val); + const int16x8_t input1_val_s16_high = + vmovl_s8(vget_high_s8(input1_val_original)); + const int16x8_t input1_val_s16_low = + vmovl_s8(vget_low_s8(input1_val_original)); - auto p1 = vmull_s16(input2_val_low, input1_val_low); - auto p2 = vmull_s16(input2_val_high, input1_val_high); + const int16x8_t input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const int16x8_t input2_val_s16_low = + vmovl_s8(vget_low_s8(input2_val_original)); + const int16x8_t input1_val_high = + vaddq_s16(input1_val_s16_high, input1_offset_vector); + const int16x8_t input2_val_high = + vaddq_s16(input2_val_s16_high, input2_offset_vector); + const int16x8_t input1_val_low = + vaddq_s16(input1_val_s16_low, input1_offset_vector); + const int16x8_t input2_val_low = + vaddq_s16(input2_val_s16_low, input2_offset_vector); + const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high); + const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high); + const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low); + const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low); + const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high); + const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high); + const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low); + const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low); + + auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high); + auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low); + auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high); + auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low); p1 = vshlq_s32(p1, left_shift_vec); p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; p1 = RoundingDivideByPOT(p1, right_shift); p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); - const auto p = - vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(p))); - vst1_s8(output_data + i, clamped); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON @@ -117,40 +149,63 @@ inline void MulSimpleBroadcast(int size, const ArithmeticParams& params, const 
auto input2_offset_vector = vdupq_n_s16(params.input2_offset); const auto output_offset_vector = vdupq_n_s16(params.output_offset); const auto output_activation_min_vector = - vdup_n_s8(params.quantized_activation_min); + vdupq_n_s8(params.quantized_activation_min); const auto output_activation_max_vector = - vdup_n_s8(params.quantized_activation_max); + vdupq_n_s8(params.quantized_activation_max); const int left_shift = std::max(0, params.output_shift); const int right_shift = std::max(0, -params.output_shift); const int32x4_t left_shift_vec = vdupq_n_s32(left_shift); - for (; i <= size - 8; i += 8) { - // We load / store 8 at a time, multiplying as two sets of 4 int32s. - const auto input2_val_original = vld1_s8(input2_data + i); - const auto input2_val_s16 = vmovl_s8(input2_val_original); - const auto input2_val = vaddq_s16(input2_val_s16, input2_offset_vector); + for (; i <= size - 16; i += 16) { + // We load / store 16 at a time, multiplying as four sets of 4 int32s. + const auto input2_val_original = vld1q_s8(input2_data + i); + const auto input2_val_s16_high = + vmovl_s8(vget_high_s8(input2_val_original)); + const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original)); - const auto input2_val_low = vget_low_s16(input2_val); - const auto input2_val_high = vget_high_s16(input2_val); + const auto input2_val_high = + vaddq_s16(input2_val_s16_high, input2_offset_vector); + const auto input2_val_low = + vaddq_s16(input2_val_s16_low, input2_offset_vector); - auto p1 = vmull_n_s16(input2_val_low, input1_val); - auto p2 = vmull_n_s16(input2_val_high, input1_val); + const auto input2_val_low_low = vget_low_s16(input2_val_low); + const auto input2_val_low_high = vget_high_s16(input2_val_low); + const auto input2_val_high_low = vget_low_s16(input2_val_high); + const auto input2_val_high_high = vget_high_s16(input2_val_high); + + auto p1 = vmull_n_s16(input2_val_high_high, input1_val); + auto p2 = vmull_n_s16(input2_val_high_low, input1_val); + auto p3 = vmull_n_s16(input2_val_low_high, input1_val); + auto p4 = vmull_n_s16(input2_val_low_low, input1_val); p1 = vshlq_s32(p1, left_shift_vec); p2 = vshlq_s32(p2, left_shift_vec); + p3 = vshlq_s32(p3, left_shift_vec); + p4 = vshlq_s32(p4, left_shift_vec); + p1 = vqrdmulhq_n_s32(p1, params.output_multiplier); p2 = vqrdmulhq_n_s32(p2, params.output_multiplier); + p3 = vqrdmulhq_n_s32(p3, params.output_multiplier); + p4 = vqrdmulhq_n_s32(p4, params.output_multiplier); using gemmlowp::RoundingDivideByPOT; p1 = RoundingDivideByPOT(p1, right_shift); p2 = RoundingDivideByPOT(p2, right_shift); + p3 = RoundingDivideByPOT(p3, right_shift); + p4 = RoundingDivideByPOT(p4, right_shift); const auto p1_narrowed = vqmovn_s32(p1); const auto p2_narrowed = vqmovn_s32(p2); - const auto p = - vaddq_s16(vcombine_s16(p1_narrowed, p2_narrowed), output_offset_vector); - const auto clamped = - vmax_s8(output_activation_min_vector, - vmin_s8(output_activation_max_vector, vqmovn_s16(p))); - vst1_s8(output_data + i, clamped); + const auto p3_narrowed = vqmovn_s32(p3); + const auto p4_narrowed = vqmovn_s32(p4); + + const int16x8_t p_part1 = + vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector); + const int16x8_t p_part2 = + vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector); + const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1)); + + const auto clamped = vmaxq_s8(output_activation_min_vector, + vminq_s8(output_activation_max_vector, p)); + vst1q_s8(output_data + i, clamped); } #endif // NEON From 
55aee9e55084b309d5a01dae6685d4622482d6df Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 08:55:02 -0700 Subject: [PATCH 352/412] [TF:TRT] Add utilities for converting between TF types and TRT types. PiperOrigin-RevId: 312087947 Change-Id: Ie4c47ab5c6aae97af5a83bba06e3de0637752ecf --- .../tf2tensorrt/convert/convert_nodes_test.cc | 32 ++++++----------- .../compiler/tf2tensorrt/convert/utils.cc | 35 +++++++++++++++++++ .../compiler/tf2tensorrt/convert/utils.h | 3 ++ 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 82c02c17e93..964370af6be 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -137,30 +137,18 @@ std::ostream& operator<<(std::ostream& os, const std::vector& v) { return os; } -nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) { - switch (tf_dtype) { - case DT_FLOAT: - return nvinfer1::DataType::kFLOAT; - case DT_HALF: - return nvinfer1::DataType::kHALF; - case DT_INT32: - return nvinfer1::DataType::kINT32; - default: - QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype); - } +nvinfer1::DataType TfDataTypeToTrt(DataType tf_type) { + nvinfer1::DataType trt_type; + Status status = TfTypeToTrtType(tf_type, &trt_type); + EXPECT_EQ(status, Status::OK()); + return trt_type; } -DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) { - switch (trt_dtype) { - case nvinfer1::DataType::kFLOAT: - return DT_FLOAT; - case nvinfer1::DataType::kHALF: - return DT_HALF; - case nvinfer1::DataType::kINT32: - return DT_INT32; - default: - QCHECK(false) << "Unexpected data type " << static_cast(trt_dtype); - } +DataType TrtDataTypeToTf(nvinfer1::DataType trt_type) { + DataType tf_type; + Status status = TrtTypeToTfType(trt_type, &tf_type); + EXPECT_EQ(status, Status::OK()); + return tf_type; } NodeDef MakeNodeDef(const string& name, const string& op, diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.cc b/tensorflow/compiler/tf2tensorrt/convert/utils.cc index fb3ae6943d3..a4b64ec0dc5 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/errors.h" namespace tensorflow { namespace tensorrt { @@ -185,6 +186,40 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, return Status::OK(); } +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type) { + switch (tf_type) { + case DT_FLOAT: + *trt_type = nvinfer1::DataType::kFLOAT; + break; + case DT_HALF: + *trt_type = nvinfer1::DataType::kHALF; + break; + case DT_INT32: + *trt_type = nvinfer1::DataType::kINT32; + break; + default: + return errors::Internal("Unsupported tensorflow type"); + } + return Status::OK(); +} + +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type) { + switch (trt_type) { + case nvinfer1::DataType::kFLOAT: + *tf_type = DT_FLOAT; + break; + case nvinfer1::DataType::kHALF: + *tf_type = DT_HALF; + break; + case nvinfer1::DataType::kINT32: + *tf_type = DT_INT32; + break; + default: + return errors::Internal("Invalid TRT type"); + } + return Status::OK(); +} + int GetNumberOfEngineInputs(const nvinfer1::ICudaEngine* engine) { int n_bindings = engine->getNbBindings(); int n_input = 0; diff --git a/tensorflow/compiler/tf2tensorrt/convert/utils.h b/tensorflow/compiler/tf2tensorrt/convert/utils.h index 5d4cf1bb851..59eeb420134 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/utils.h +++ b/tensorflow/compiler/tf2tensorrt/convert/utils.h @@ -106,6 +106,9 @@ Status TrtDimsToTensorShape(const nvinfer1::Dims trt_dims, bool use_implicit_batch, int batch_size, TensorShape& shape); +Status TfTypeToTrtType(DataType tf_type, nvinfer1::DataType* trt_type); +Status TrtTypeToTfType(nvinfer1::DataType trt_type, DataType* tf_type); + // Returns a string that includes compile time TensorRT library version // information {Maj, Min, Patch}. string GetLinkedTensorRTVersion(); From 46f7108d78c6a3c0854fe66ce1cd92e5ebb3d6e2 Mon Sep 17 00:00:00 2001 From: Rohan Jain Date: Mon, 18 May 2020 09:08:29 -0700 Subject: [PATCH 353/412] Internal change PiperOrigin-RevId: 312090528 Change-Id: I474709513b01db8c24c50fd670029451c51cb622 --- tensorflow/python/keras/layers/embeddings.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py index 3f57fd6cb63..e30e93f02dc 100644 --- a/tensorflow/python/keras/layers/embeddings.py +++ b/tensorflow/python/keras/layers/embeddings.py @@ -129,8 +129,10 @@ class Embedding(Layer): # since it knows all kernels using the variable only exist on CPU. # When eager execution is enabled, the placement decision has to be made # right now. Checking for the presence of GPUs to avoid complicating the - # TPU codepaths which can handle sparse optimizers. - if context.executing_eagerly() and context.context().num_gpus(): + # TPU codepaths which can handle sparse optimizers. But if we are within + # a tf.function, we go back the graph mode logic and rely on the placer. + if (context.executing_eagerly() and context.context().num_gpus() and + not ops.inside_function()): with ops.device('cpu:0'): self.embeddings = self.add_weight( shape=(self.input_dim, self.output_dim), From 32165792a3ae4705f50d82329db0733aa01bb6ed Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 18 May 2020 09:23:09 -0700 Subject: [PATCH 354/412] [TF:TRT] Implement cast from fp16 to fp32 with IIdentityLayer. This is the first CL to implement the request in b/150285802. 
Add Cast op test to convert_nodes_test. PiperOrigin-RevId: 312093049 Change-Id: I77215cf6da104f51acc93de1b03e9a179db54f0a --- .../tf2tensorrt/convert/convert_nodes.cc | 106 +++++++++++++++--- .../tf2tensorrt/convert/convert_nodes.h | 2 + .../tf2tensorrt/convert/convert_nodes_test.cc | 21 +++- 3 files changed, 109 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index a43b16e9e6a..e791ff9ff60 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -29,6 +29,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2tensorrt/convert/utils.h" #include "tensorflow/compiler/tf2tensorrt/utils/trt_logger.h" @@ -795,6 +796,19 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const { } } +Status TRT_TensorOrWeights::GetTfType(DataType* tf_type) const { + if (is_tensor()) { + nvinfer1::DataType trt_type = tensor()->getType(); + return TrtTypeToTfType(trt_type, tf_type); + } + + if (is_weights()) { + *tf_type = weights().GetTensor().dtype(); + return Status::OK(); + } + return errors::Internal("The object is probably not initialized"); +} + string TRT_TensorOrWeights::DebugString() const { string output = "TRT_TensorOrWeights(type="; if (is_tensor()) { @@ -1900,27 +1914,48 @@ Status CheckInputsWeights( return Status::OK(); } -Status AllowDataTypes(const OpConverterParams& params, - const std::set& allowed_dtypes, - const char* dtype_attr_name = "T") { - const auto& node_def = params.node_def; +Status GetNodeDefTfType(const NodeDef& node_def, DataType* tf_type, + const char* type_attr_name) { TFAttrs attrs(node_def); - if (!attrs.count(dtype_attr_name)) { - return errors::InvalidArgument("Attribute with name ", dtype_attr_name, + if (!attrs.count(type_attr_name)) { + return errors::InvalidArgument("Attribute with name ", type_attr_name, " not found."); } - const auto op_dtype = attrs.get(dtype_attr_name); - if (!allowed_dtypes.count(op_dtype)) { - // Build string list of allowed types. 
- std::ostringstream ss; - for (auto it = allowed_dtypes.begin(); it != allowed_dtypes.end(); ++it) { - if (it != allowed_dtypes.begin()) ss << ", "; - ss << DataTypeString(*it); - } - return errors::Unimplemented("Data type ", DataTypeString(op_dtype), + *tf_type = attrs.get(type_attr_name); + return Status::OK(); +} + +Status GetInputTfType(const OpConverterParams& params, DataType* tf_type, + int pos) { + const std::vector& inputs = params.inputs; + if (inputs.size() <= pos) { + return errors::Internal("Invalid input position"); + } + + return inputs[pos].GetTfType(tf_type); +} + +constexpr const char kOutputTypeAttrName[] = "T"; + +Status GetOutputTfType(const OpConverterParams& params, DataType* tf_type) { + return GetNodeDefTfType(params.node_def, tf_type, kOutputTypeAttrName); +} + +Status AllowDataTypes(const OpConverterParams& params, + const std::set& allowed_types, + const char* type_attr_name = kOutputTypeAttrName) { + const auto& node_def = params.node_def; + DataType tf_type; + TF_RETURN_IF_ERROR(GetNodeDefTfType(node_def, &tf_type, type_attr_name)); + if (!allowed_types.count(tf_type)) { + string allowed_types_string = absl::StrJoin( + allowed_types, ", ", [](string* out, const DataType& type) { + absl::StrAppendFormat(out, "%s", DataTypeString(type)); + }); + return errors::Unimplemented("Data type ", DataTypeString(tf_type), " is not supported for ", node_def.op(), - ", must be one of [", ss.str(), "], at ", - node_def.name()); + ", must be one of [", allowed_types_string, + "], at ", node_def.name()); } return Status::OK(); } @@ -4598,6 +4633,42 @@ Status ConvertUnpack(OpConverterParams* params) { return ConvertSplitHelper(params, inputs.at(0), tf_axis, num, true); } +// Supports cast fp16=>fp32 through IIdentityLayer. +Status ConvertCast(OpConverterParams* params) { + const NodeDef& node_def = params->node_def; + TF_RETURN_IF_ERROR(CheckInputsWeights(*params, {{"x", false}})); + auto unsupport_cast_error = [&]() { + return errors::Unimplemented("Cast op: ", node_def.op(), + " not supported at: ", node_def.name()); + }; + + DataType input_type; + TF_RETURN_IF_ERROR(GetInputTfType(*params, &input_type, 0)); + if (input_type != DataType::DT_HALF) { + return unsupport_cast_error(); + } + + DataType output_type; + TF_RETURN_IF_ERROR(GetOutputTfType(*params, &output_type)); + if (output_type != DataType::DT_FLOAT) { + return unsupport_cast_error(); + } + + if (params->validation_only) return Status::OK(); + + nvinfer1::ITensor* input = params->inputs.at(0).tensor(); + nvinfer1::IIdentityLayer* layer = + params->converter->network()->addIdentity(*input); + layer->setPrecision(nvinfer1::DataType::kFLOAT); + + if (layer->getOutput(0)->getType() != nvinfer1::DataType::kFLOAT) { + return errors::Internal("IIdentityLayer doesn't work as expected"); + } + + params->outputs->push_back(TRT_TensorOrWeights(layer->getOutput(0))); + return Status::OK(); +} + Status ConvertConcat(OpConverterParams* params) { const auto& inputs = params->inputs; const auto& node_def = params->node_def; @@ -5675,6 +5746,7 @@ static void RegisterValidatableOpConverters( (*registration)["CombinedNonMaxSuppression"] = ConvertCombinedNMS; #endif (*registration)["AddN"] = ConvertAddN; + (*registration)["Cast"] = ConvertCast; (*registration)["ConcatV2"] = ConvertConcat; (*registration)["Const"] = ConvertConst; (*registration)["Conv2D"] = ConvertConv2D; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index 2092aecd657..2fe8eec9675 100644 
--- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -294,6 +294,8 @@ class TRT_TensorOrWeights { nvinfer1::Dims GetTrtDims() const; + Status GetTfType(DataType* tf_type) const; + int batch_size() const { return batch_size_; } string DebugString() const; diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 964370af6be..1efc31f9e24 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -5147,6 +5147,14 @@ NodeDef CreateUnaryOp() { return T(s.WithOpName("my_unary"), input).operation.node()->def(); } +NodeDef CreateCastOp() { + Scope s = Scope::NewRootScope(); + auto input = ops::Placeholder(s.WithOpName("input"), DT_HALF); + return ops::Cast(s.WithOpName("my_unary"), input, DT_FLOAT) + .operation.node() + ->def(); +} + TEST_P(ParameterizedOpConverterTest, ConvertUnary) { const auto& spec = GetParam(); const TrtTestMode trt_mode = std::get<0>(spec); @@ -5174,6 +5182,7 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { ADD_OP("Asinh", ops::Asinh, std::asinh); ADD_OP("Atan", ops::Atan, std::atan); ADD_OP("Atanh", ops::Atanh, std::atanh); + op_map["Cast"] = std::make_pair(CreateCastOp, [](float x) { return x; }); ADD_OP("Ceil", ops::Ceil, std::ceil); ADD_OP("Cos", ops::Cos, std::cos); ADD_OP("Cosh", ops::Cosh, std::cosh); @@ -5212,7 +5221,13 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { } NodeDef node_def = op_map[op_name].first(); - AddTestTensor("input", p.input_dims, TfDataTypeToTrt(tf_dtype), trt_mode); + // TODO(bixia): we assume this test is only instantiated for DT_FLOAT for + // now. Need to find a better way to express input and output types. + DataType input_tf_dtype = op_name == "Cast" ? DT_HALF : tf_dtype; + DataType output_tf_dtype = tf_dtype; + + AddTestTensor("input", p.input_dims, TfDataTypeToTrt(input_tf_dtype), + trt_mode); RunValidationAndConversion(node_def, Status::OK(), "my_unary", p.expected_output_dims); @@ -5220,8 +5235,8 @@ TEST_P(ParameterizedOpConverterTest, ConvertUnary) { std::vector output; std::transform(input_values.begin(), input_values.end(), std::back_inserter(output), op_map[op_name].second); - InstantiateBuildAndRun(tf_dtype, "my_unary", this, p, input_values, - ArrayFloatNear(output, 0.0001, true)); + InstantiateBuildAndRun(input_tf_dtype, output_tf_dtype, "my_unary", this, p, + input_values, ArrayFloatNear(output, 0.0001, true)); } } From 9c49cda7d988680985aa194703edd72df60a57bc Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Mon, 18 May 2020 09:27:00 -0700 Subject: [PATCH 355/412] Update release notes for the 1.15.3, 2.0.2 and 2.1.1 patch releases. 
PiperOrigin-RevId: 312093793 Change-Id: I476369d7d3f8e8d54dd10f412f25049265fc688f --- RELEASE.md | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 6c8921cf492..f251f6ceffa 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,28 @@ +# Release 2.1.1 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) +* Fixes a versioning bug which causes Keras layers from TF 1.x to be used instead of those from TF 2.x + +# Release 2.0.2 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + +# Release 1.15.3 + +## Bug Fixes and Other Changes +* Updates `sqlite3` to `3.31.01` to handle [CVE-2019-19880](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19880), [CVE-2019-19244](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19244) and [CVE-2019-19645](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-19645) +* Updates `curl` to `7.69.1` to handle [CVE-2019-15601](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-15601) +* Updates `libjpeg-turbo` to `2.0.4` to handle [CVE-2018-19664](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-19664), [CVE-2018-20330](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-20330) and [CVE-2019-13960](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-13960) +* Updates Apache Spark to `2.4.5` to handle [CVE-2019-10099](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2019-10099), [CVE-2018-17190](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-17190) and [CVE-2018-11770](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2018-11770) + # Release 2.2.0 TensorFlow 
2.2 discontinues support for Python 2, [previously announced](https://groups.google.com/a/tensorflow.org/d/msg/announce/gVwS5RC8mds/dCt1ka2XAAAJ) as following [Python 2's EOL on January 1, 2020](https://www.python.org/dev/peps/pep-0373/#update). From cfdb9434054da65025c25d5dbcda029c16faf868 Mon Sep 17 00:00:00 2001 From: Ilya Tokar Date: Mon, 18 May 2020 09:35:23 -0700 Subject: [PATCH 356/412] Tweak round_to_bfloat16 to make it vectorizable. This simplifies control flow by handling positive and negative denormals separately. Should be ~40% faster. PiperOrigin-RevId: 312095390 Change-Id: I5b6388e48b8c217edb0fc4fe14c3add64fb52c65 --- tensorflow/core/lib/bfloat16/bfloat16.h | 327 ++++++++++++------------ 1 file changed, 163 insertions(+), 164 deletions(-) diff --git a/tensorflow/core/lib/bfloat16/bfloat16.h b/tensorflow/core/lib/bfloat16/bfloat16.h index 4c38738593f..54d78480066 100644 --- a/tensorflow/core/lib/bfloat16/bfloat16.h +++ b/tensorflow/core/lib/bfloat16/bfloat16.h @@ -194,171 +194,170 @@ struct bfloat16 { input = f.u; bfloat16 output; + // Fast rounding algorithm that rounds a half value to nearest even. This + // reduces expected error when we convert a large number of floats. Here + // is how it works: + // + // Definitions: + // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits + // with the following tags: + // + // Sign | Exp (8 bits) | Frac (23 bits) + // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT + // + // S: Sign bit. + // E: Exponent bits. + // F: First 6 bits of fraction. + // L: Least significant bit of resulting bfloat16 if we truncate away the + // rest of the float32. This is also the 7th bit of fraction + // R: Rounding bit, 8th bit of fraction. + // T: Sticky bits, rest of fraction, 15 bits. + // + // To round half to nearest even, there are 3 cases where we want to round + // down (simply truncate the result of the bits away, which consists of + // rounding bit and sticky bits) and two cases where we want to round up + // (truncate then add one to the result). + // + // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of + // 1s) as the rounding bias, adds the rounding bias to the input, then + // truncates the last 16 bits away. + // + // To understand how it works, we can analyze this algorithm case by case: + // + // 1. L = 0, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input may create any carry, depending on + // whether there is any value set to 1 in T bits. + // - R may be set to 1 if there is a carry. + // - L remains 0. + // - Note that this case also handles Inf and -Inf, where all fraction + // bits, including L, R and Ts are all 0. The output remains Inf after + // this algorithm. + // + // 2. L = 1, R = 0: + // Expect: round down, this is less than half value. + // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits but + // adds 1 to rounding bit. + // - L remains 1. + // + // 3. L = 0, R = 1, all of T are 0: + // Expect: round down, this is exactly at half, the result is already + // even (L=0). + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input sets all sticky bits to 1, but + // doesn't create a carry. + // - R remains 1. + // - L remains 0. + // + // 4. L = 1, R = 1: + // Expect: round up, this is exactly at half, the result needs to be + // round to the next even number. 
+ // + // Algorithm: + // - Rounding bias: 0x7fff + 1 = 0x8000 + // - Adding rounding bias to input doesn't change sticky bits, but + // creates a carry from rounding bit. + // - The carry sets L to 0, creates another carry bit and propagate + // forward to F bits. + // - If all the F bits are 1, a carry then propagates to the exponent + // bits, which then creates the minimum value with the next exponent + // value. Note that we won't have the case where exponents are all 1, + // since that's either a NaN (handled in the other if condition) or inf + // (handled in case 1). + // + // 5. L = 0, R = 1, any of T is 1: + // Expect: round up, this is greater than half. + // + // Algorithm: + // - Rounding bias: 0x7fff + 0 = 0x7fff + // - Adding rounding bias to input creates a carry from sticky bits, + // sets rounding bit to 0, then create another carry. + // - The second carry sets L to 1. + // + // Examples: + // + // Exact half value that is already even: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 + // + // This falls into case 3. We truncate the rest of 16 bits and no + // carry is created into F and L: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // Exact half value, round to next even number: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 + // + // This falls into case 4. We create a carry from R and T, + // which then propagates into L and F: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 + // + // + // Max denormal value round to min normal value: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Output: + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 + // + // Max normal value round to Inf: + // Input: + // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) + // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT + // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 + // + // This falls into case 4. We create a carry from R and T, + // propagate into L and F, which then propagates into exponent + // bits: + // + // Sign | Exp (8 bit) | Frac (first 7 bit) + // S E E E E E E E E F F F F F F L + // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 + // + // + // Least significant bit of resulting bfloat. + uint32_t lsb = (input >> 16) & 1; + uint32_t rounding_bias = 0x7fff + lsb; + input += rounding_bias; + output.value = static_cast(input >> 16); + if ((f.u & 0xff800000u) == 0) { + // Flush positive denormal to 0 + output.value = 0x0; + } + if ((f.u & 0xff800000u) == 0x80000000u) { + // Flush negative denormal to -0 + output.value = 0x8000; + } if (float_isnan(v)) { - // If the value is a NaN, squash it to a qNaN with msb of fraction set, - // this makes sure after truncation we don't end up with an inf. 
- // - // qNaN magic: All exponent bits set + most significant bit of fraction - // set. - output.value = 0x7fc0; - } else if (std::fabs(v) < std::numeric_limits::min()) { - // Flush denormal to +/- 0.0 - output.value = std::signbit(v) ? 0x8000 : 0; - } else { - // Fast rounding algorithm that rounds a half value to nearest even. This - // reduces expected error when we convert a large number of floats. Here - // is how it works: - // - // Definitions: - // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits - // with the following tags: - // - // Sign | Exp (8 bits) | Frac (23 bits) - // S EEEEEEEE FFFFFFLRTTTTTTTTTTTTTTT - // - // S: Sign bit. - // E: Exponent bits. - // F: First 6 bits of fraction. - // L: Least significant bit of resulting bfloat16 if we truncate away the - // rest of the float32. This is also the 7th bit of fraction - // R: Rounding bit, 8th bit of fraction. - // T: Sticky bits, rest of fraction, 15 bits. - // - // To round half to nearest even, there are 3 cases where we want to round - // down (simply truncate the result of the bits away, which consists of - // rounding bit and sticky bits) and two cases where we want to round up - // (truncate then add one to the result). - // - // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of - // 1s) as the rounding bias, adds the rounding bias to the input, then - // truncates the last 16 bits away. - // - // To understand how it works, we can analyze this algorithm case by case: - // - // 1. L = 0, R = 0: - // Expect: round down, this is less than half value. - // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input may create any carry, depending on - // whether there is any value set to 1 in T bits. - // - R may be set to 1 if there is a carry. - // - L remains 0. - // - Note that this case also handles Inf and -Inf, where all fraction - // bits, including L, R and Ts are all 0. The output remains Inf after - // this algorithm. - // - // 2. L = 1, R = 0: - // Expect: round down, this is less than half value. - // - // Algorithm: - // - Rounding bias: 0x7fff + 1 = 0x8000 - // - Adding rounding bias to input doesn't change sticky bits but - // adds 1 to rounding bit. - // - L remains 1. - // - // 3. L = 0, R = 1, all of T are 0: - // Expect: round down, this is exactly at half, the result is already - // even (L=0). - // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input sets all sticky bits to 1, but - // doesn't create a carry. - // - R remains 1. - // - L remains 0. - // - // 4. L = 1, R = 1: - // Expect: round up, this is exactly at half, the result needs to be - // round to the next even number. - // - // Algorithm: - // - Rounding bias: 0x7fff + 1 = 0x8000 - // - Adding rounding bias to input doesn't change sticky bits, but - // creates a carry from rounding bit. - // - The carry sets L to 0, creates another carry bit and propagate - // forward to F bits. - // - If all the F bits are 1, a carry then propagates to the exponent - // bits, which then creates the minimum value with the next exponent - // value. Note that we won't have the case where exponents are all 1, - // since that's either a NaN (handled in the other if condition) or inf - // (handled in case 1). - // - // 5. L = 0, R = 1, any of T is 1: - // Expect: round up, this is greater than half. 
- // - // Algorithm: - // - Rounding bias: 0x7fff + 0 = 0x7fff - // - Adding rounding bias to input creates a carry from sticky bits, - // sets rounding bit to 0, then create another carry. - // - The second carry sets L to 1. - // - // Examples: - // - // Exact half value that is already even: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1000000000000000 - // - // This falls into case 3. We truncate the rest of 16 bits and no - // carry is created into F and L: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - // - // Exact half value, round to next even number: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1000000000000000 - // - // This falls into case 4. We create a carry from R and T, - // which then propagates into L and F: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 - // - // - // Max denormal value round to min normal value: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1111111111111111 - // - // This falls into case 4. We create a carry from R and T, - // propagate into L and F, which then propagates into exponent - // bits: - // - // Output: - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 - // - // Max normal value round to Inf: - // Input: - // Sign | Exp (8 bit) | Frac (first 7 bit) | Frac (last 16 bit) - // S E E E E E E E E F F F F F F L RTTTTTTTTTTTTTTT - // 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1111111111111111 - // - // This falls into case 4. We create a carry from R and T, - // propagate into L and F, which then propagates into exponent - // bits: - // - // Sign | Exp (8 bit) | Frac (first 7 bit) - // S E E E E E E E E F F F F F F L - // 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 - // - // - // Least significant bit of resulting bfloat. - uint32_t lsb = (input >> 16) & 1; - uint32_t rounding_bias = 0x7fff + lsb; - input += rounding_bias; - output.value = static_cast(input >> 16); + output.value = NAN_VALUE; } return output; } From dbc0fffedb506c12837a5eda0d87b01b659136ba Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Mon, 18 May 2020 09:35:47 -0700 Subject: [PATCH 357/412] Report remote target name for worker service RPCs. 
PiperOrigin-RevId: 312095453 Change-Id: I73fc7948f994426b8d62bdefd5573cfe3b5b793d --- .../rpc/grpc_remote_worker.cc | 16 ++++++++++------ .../distributed_runtime/rpc/grpc_remote_worker.h | 3 ++- .../distributed_runtime/rpc/grpc_worker_cache.cc | 6 +++--- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc index 85431acdf0c..6e706179863 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc @@ -45,7 +45,7 @@ class GrpcRemoteWorker : public WorkerInterface { explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger) + WorkerCacheLogger* logger, const string& target) : channel_(std::move(channel)), stub_(channel_), cq_(completion_queue), @@ -66,7 +66,8 @@ class GrpcRemoteWorker : public WorkerInterface { instancesource_(Method(GrpcWorkerMethod::kCompleteInstance)), getstepsequence_(Method(GrpcWorkerMethod::kGetStepSequence)), markrecvfinished_(Method(GrpcWorkerMethod::kMarkRecvFinished)), - logger_(logger) {} + logger_(logger), + target_(target) {} ~GrpcRemoteWorker() override {} @@ -273,7 +274,7 @@ class GrpcRemoteWorker : public WorkerInterface { bool fail_fast = true) { new RPCState( &stub_, cq_, method, *request, response, std::move(done), call_opts, - callback_threadpool_, /*max_retries=*/0, fail_fast); + callback_threadpool_, /*max_retries=*/0, fail_fast, &target_); } void IssueRequest(const protobuf::Message* request, TensorResponse* response, @@ -281,7 +282,8 @@ class GrpcRemoteWorker : public WorkerInterface { CallOptions* call_opts = nullptr) { new RPCState(&stub_, cq_, method, *request, response, std::move(done), call_opts, - callback_threadpool_); + callback_threadpool_, /*max_retries=*/0, + /*fail_fast=*/true, &target_); } void IssueMarkRecvFinishedRequest(int64 request_id) { @@ -321,6 +323,7 @@ class GrpcRemoteWorker : public WorkerInterface { // Support for logging. 
WorkerCacheLogger* logger_; + const string target_; TF_DISALLOW_COPY_AND_ASSIGN(GrpcRemoteWorker); }; @@ -328,9 +331,10 @@ class GrpcRemoteWorker : public WorkerInterface { WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger) { + WorkerCacheLogger* logger, + const string& target) { return new GrpcRemoteWorker(std::move(channel), completion_queue, - callback_threadpool, logger); + callback_threadpool, logger, target); } } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h index c0a49ecfc38..97e590e0ad1 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h +++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h @@ -29,7 +29,8 @@ class WorkerInterface; WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel, ::grpc::CompletionQueue* completion_queue, thread::ThreadPool* callback_threadpool, - WorkerCacheLogger* logger); + WorkerCacheLogger* logger, + const string& target); } // namespace tensorflow diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc index f6b6e15a2ba..1d75728ddd2 100644 --- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc +++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc @@ -69,9 +69,9 @@ class GrpcWorkerCache : public WorkerCachePartial { return nullptr; } size_t index = AssignWorkerToThread(target); - return NewGrpcRemoteWorker(channel, - worker_env_->GetCompletionQueue(index), - worker_env_->GetThreadPool(), &logger_); + return NewGrpcRemoteWorker( + channel, worker_env_->GetCompletionQueue(index), + worker_env_->GetThreadPool(), &logger_, target); } } From 1b2a65c15fed4a27bc94ebbce930feea455d927f Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 18 May 2020 09:46:53 -0700 Subject: [PATCH 358/412] Add legalization from hlo.dot to lhlo.dot PiperOrigin-RevId: 312097353 Change-Id: Ia8b0fef86c77426f54090354779c62163bf97426 --- .../mlir/xla/tests/hlo-legalize-to-lhlo.mlir | 12 ++++++++++++ .../mlir/xla/transforms/hlo_legalize_to_lhlo.cc | 1 + .../mlir/xla/transforms/map_hlo_to_lhlo_op.h | 1 + 3 files changed, 14 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir index 53296b257ae..68f6d172afc 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-lhlo.mlir @@ -395,3 +395,15 @@ func @tanh_dyn(%arg0: tensor) { // CHECK: "xla_lhlo.tanh"(%arg0, %[[RESULT]]) : (memref, memref) -> () return } + +// ----- + +// CHECK-LABEL: func @dot +func @dot(%arg0: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> { +// CHECK-SAME: (%[[ARG0:.*]]: [[TYPE:.*]], +// CHECK-SAME: %[[RESULT:.*]]: [[TYPE]]) +// CHECK: "xla_lhlo.dot"(%[[ARG0]], %[[ARG0]], %{{.*}}) : ([[TYPE]], [[TYPE]], [[TYPE]]) -> () + %dot = "xla_hlo.dot"(%arg0, %arg0) + : (tensor<1024x1024xf32>, tensor<1024x1024xf32>) -> tensor<1024x1024xf32> + return %dot : tensor<1024x1024xf32> + } diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 10f35768bbd..11b2ae65d8e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ 
b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -362,6 +362,7 @@ void populateHLOToLHLOConversionPattern( HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, diff --git a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h index fed21e9bafc..21b954a3eb4 100644 --- a/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h +++ b/tensorflow/compiler/mlir/xla/transforms/map_hlo_to_lhlo_op.h @@ -49,6 +49,7 @@ MAP_HLO_TO_LHLO(ConvertOp); MAP_HLO_TO_LHLO(CopyOp); MAP_HLO_TO_LHLO(CosOp); MAP_HLO_TO_LHLO(DivOp); +MAP_HLO_TO_LHLO(DotOp); MAP_HLO_TO_LHLO(ExpOp); MAP_HLO_TO_LHLO(ImagOp); MAP_HLO_TO_LHLO(IotaOp); From 0bf90cb2a8b241a728943d343f1cdd922e408c73 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 10:12:52 -0700 Subject: [PATCH 359/412] Enable (non-gradient) tests of tf.linalg.cholesky in eager mode. PiperOrigin-RevId: 312102967 Change-Id: Icefc46a8268413dfaec42109d4f57dd07f602a54 --- .../python/kernel_tests/cholesky_op_test.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py index 7d5f7715eb1..01c497a37ed 100644 --- a/tensorflow/python/kernel_tests/cholesky_op_test.py +++ b/tensorflow/python/kernel_tests/cholesky_op_test.py @@ -32,7 +32,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.ops.linalg import linalg from tensorflow.python.platform import benchmark @@ -91,7 +91,7 @@ def TriAngInvCompositeGrad(l, grad): class CholeskyOpTest(test.TestCase): - def _verifyCholeskyBase(self, sess, x, chol, verification): + def _verifyCholeskyBase(self, x, chol, verification): chol_np, verification_np = self.evaluate([chol, verification]) self.assertAllClose(x, verification_np) self.assertShapeEqual(x, chol) @@ -106,11 +106,11 @@ class CholeskyOpTest(test.TestCase): def _verifyCholesky(self, x): # Verify that LL^T == x. 
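For illustration only, a standalone sketch of the property this helper checks; it is not part of the test file, and the matrix is just the small positive-definite example the test uses elsewhere:

```python
import numpy as np
import tensorflow as tf

# Factor a symmetric positive-definite matrix and confirm that L @ L^T
# reproduces the original matrix, which is what _verifyCholesky asserts.
a = np.array([[4., -1., 2.], [-1., 6., 0.], [2., 0., 5.]])
chol = tf.linalg.cholesky(a)
reconstructed = tf.matmul(chol, chol, adjoint_b=True)
np.testing.assert_allclose(a, reconstructed.numpy(), atol=1e-6)
```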
- with self.cached_session(use_gpu=True) as sess: - chol = linalg_ops.cholesky(x) - verification = math_ops.matmul(chol, chol, adjoint_b=True) - self._verifyCholeskyBase(sess, x, chol, verification) + chol = linalg_ops.cholesky(x) + verification = math_ops.matmul(chol, chol, adjoint_b=True) + self._verifyCholeskyBase(x, chol, verification) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBasic(self): data = np.array([[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]) for dtype in (np.float32, np.float64): @@ -123,6 +123,7 @@ class CholeskyOpTest(test.TestCase): complex_data += data self._verifyCholesky(complex_data) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testBatch(self): simple_array = np.array([[[1., 0.], [0., 5.]]]) # shape (1, 2, 2) self._verifyCholesky(simple_array) @@ -144,21 +145,21 @@ class CholeskyOpTest(test.TestCase): matrices[i] = np.dot(matrices[i].T.conj(), matrices[i]) self._verifyCholesky(matrices) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNonSquareMatrix(self): - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]])) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky( np.array([[[1., 2., 3.], [3., 4., 5.]], [[1., 2., 3.], [3., 4., 5.]] ])) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): tensor3 = constant_op.constant([1., 2.]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(tensor3) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): linalg_ops.cholesky(tensor3) # The below invalid Cholesky call returns an error with TF Classic and just @@ -175,21 +176,23 @@ class CholeskyOpTest(test.TestCase): self._verifyCholesky( np.array([[1., -1., 0.], [-1., 1., -1.], [0., -1., 1.]])) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): self._verifyCholesky(np.empty([0, 2, 2])) self._verifyCholesky(np.empty([2, 0, 0])) @test_util.run_deprecated_v1 def testConcurrentExecutesWithoutError(self): - with self.session(use_gpu=True) as sess: - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - matrix1 = math_ops.matmul(matrix1, matrix1, adjoint_a=True) - matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True) - c1 = linalg_ops.cholesky(matrix1) - c2 = linalg_ops.cholesky(matrix2) - c1_val, c2_val = self.evaluate([c1, c2]) - self.assertAllClose(c1_val, c2_val) + seed = [42, 24] + matrix_shape = [5, 5] + matrix1 = stateless_random_ops.stateless_random_normal(matrix_shape, seed) + matrix2 = stateless_random_ops.stateless_random_normal(matrix_shape, seed) + matrix1 = math_ops.matmul(matrix1, matrix1, adjoint_a=True) + matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True) + c1 = linalg_ops.cholesky(matrix1) + c2 = linalg_ops.cholesky(matrix2) + c1_val, c2_val = self.evaluate([c1, c2]) + self.assertAllClose(c1_val, c2_val) class CholeskyGradTest(test.TestCase): From 83b85568fb5a5aade46a41909ee9a1b6f3643b57 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 18 May 2020 10:23:36 -0700 Subject: [PATCH 360/412] Support int8 in tflite_convert PiperOrigin-RevId: 312105323 Change-Id: I161b9b324e37f42f2026592f7c5bec8ac568c3d6 
--- tensorflow/lite/python/tflite_convert.py | 6 ++- tensorflow/lite/python/tflite_convert_test.py | 39 +++++++++++++++---- 2 files changed, 35 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py index d0dd7313df3..c7504a3a638 100644 --- a/tensorflow/lite/python/tflite_convert.py +++ b/tensorflow/lite/python/tflite_convert.py @@ -65,6 +65,8 @@ def _parse_inference_type(value, flag): return lite_constants.FLOAT if value == "QUANTIZED_UINT8": return lite_constants.QUANTIZED_UINT8 + if value == "INT8": + return lite_constants.INT8 raise ValueError("Unsupported value for --{0}. Only FLOAT and " "QUANTIZED_UINT8 are supported.".format(flag)) @@ -352,12 +354,12 @@ def _get_tf1_flags(parser): parser.add_argument( "--inference_type", type=str.upper, - choices=["FLOAT", "QUANTIZED_UINT8"], + choices=["FLOAT", "QUANTIZED_UINT8", "INT8"], help="Target data type of real-number arrays in the output file.") parser.add_argument( "--inference_input_type", type=str.upper, - choices=["FLOAT", "QUANTIZED_UINT8"], + choices=["FLOAT", "QUANTIZED_UINT8", "INT8"], help=("Target data type of real-number input arrays. Allows for a " "different type for input arrays in the case of quantization.")) diff --git a/tensorflow/lite/python/tflite_convert_test.py b/tensorflow/lite/python/tflite_convert_test.py index 1e80907edbd..d6a35ba9248 100644 --- a/tensorflow/lite/python/tflite_convert_test.py +++ b/tensorflow/lite/python/tflite_convert_test.py @@ -98,8 +98,8 @@ class TfLiteConvertV1Test(TestModels): sess.close() flags_str = ('--graph_def_file={0} --input_arrays={1} ' - '--output_arrays={2}'.format(graph_def_file, - 'Placeholder', 'add')) + '--output_arrays={2}'.format(graph_def_file, 'Placeholder', + 'add')) self._run(flags_str, should_succeed=True) os.remove(graph_def_file) @@ -137,8 +137,31 @@ class TfLiteConvertV1Test(TestModels): sess.close() flags_str = ('--graph_def_file={0} --input_arrays={1} ' - '--output_arrays={2}'.format(graph_def_file, - 'random', 'add')) + '--output_arrays={2}'.format(graph_def_file, 'random', 'add')) + self._run(flags_str, should_succeed=True) + os.remove(graph_def_file) + + def testQATFrozenGraphDefInt8(self): + with ops.Graph().as_default(): + in_tensor_1 = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputA') + in_tensor_2 = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputB') + _ = array_ops.fake_quant_with_min_max_args( + in_tensor_1 + in_tensor_2, min=0., max=1., name='output', + num_bits=16) # INT8 inference type works for 16 bits fake quant. + sess = session.Session() + + # Write graph to file. 
+ graph_def_file = self._getFilepath('model.pb') + write_graph(sess.graph_def, '', graph_def_file, False) + sess.close() + + flags_str = ('--inference_type=INT8 --std_dev_values=128,128 ' + '--mean_values=128,128 ' + '--graph_def_file={0} --input_arrays={1},{2} ' + '--output_arrays={3}'.format(graph_def_file, 'inputA', + 'inputB', 'output')) self._run(flags_str, should_succeed=True) os.remove(graph_def_file) @@ -166,8 +189,8 @@ class TfLiteConvertV1Test(TestModels): def testKerasFileMLIR(self): keras_file = self._getKerasModelFile() - flags_str = ('--keras_model_file={} --experimental_new_converter' - .format(keras_file)) + flags_str = ( + '--keras_model_file={} --experimental_new_converter'.format(keras_file)) self._run(flags_str, should_succeed=True) os.remove(keras_file) @@ -299,8 +322,8 @@ class TfLiteConvertV2Test(TestModels): def testKerasFileMLIR(self): keras_file = self._getKerasModelFile() - flags_str = ('--keras_model_file={} --experimental_new_converter' - .format(keras_file)) + flags_str = ( + '--keras_model_file={} --experimental_new_converter'.format(keras_file)) self._run(flags_str, should_succeed=True) os.remove(keras_file) From dec7430b13213974928ae395322feabc788b1664 Mon Sep 17 00:00:00 2001 From: Kibeom Kim Date: Mon, 18 May 2020 10:38:01 -0700 Subject: [PATCH 361/412] Ensure that tf_py_test tfrt test is not enabled for open source build by introducing tfrt_enabled_internal flag. PiperOrigin-RevId: 312108475 Change-Id: Ia73668bf1e8f097441ed23dd75fb1ac2c0327e1f --- tensorflow/python/data/service/BUILD | 2 ++ tensorflow/python/eager/BUILD | 2 +- tensorflow/python/keras/layers/preprocessing/BUILD | 2 ++ tensorflow/python/kernel_tests/BUILD | 5 ++++- tensorflow/python/kernel_tests/proto/BUILD | 2 +- tensorflow/python/saved_model/BUILD | 2 ++ tensorflow/tensorflow.bzl | 11 ++++++++++- 7 files changed, 22 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/data/service/BUILD b/tensorflow/python/data/service/BUILD index 19bcaa3b952..18678230205 100644 --- a/tensorflow/python/data/service/BUILD +++ b/tensorflow/python/data/service/BUILD @@ -1,4 +1,6 @@ load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") package( diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD index c08cb8cc1c3..394b929bf1b 100644 --- a/tensorflow/python/eager/BUILD +++ b/tensorflow/python/eager/BUILD @@ -1,7 +1,7 @@ -load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "cuda_py_test") # buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") load( diff --git a/tensorflow/python/keras/layers/preprocessing/BUILD b/tensorflow/python/keras/layers/preprocessing/BUILD index 052a57b52f3..b580382f9d8 100644 --- a/tensorflow/python/keras/layers/preprocessing/BUILD +++ b/tensorflow/python/keras/layers/preprocessing/BUILD @@ -2,6 +2,8 @@ # Contains the Keras preprocess layers (internal TensorFlow version). 
load("//tensorflow:tensorflow.bzl", "tf_py_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "cuda_py_test") load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") load("//tensorflow/core/platform/default:distribute.bzl", "distribute_py_test") diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 13f59b74baf..cd03da9b179 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1,8 +1,11 @@ # Tests of TensorFlow kernels written using the Python API. -load("//tensorflow:tensorflow.bzl", "sycl_py_test", "tf_custom_op_library", "tf_py_test") +load("//tensorflow:tensorflow.bzl", "sycl_py_test", "tf_custom_op_library") load("//tensorflow:tensorflow.bzl", "cuda_py_test") +# buildifier: disable=same-origin-load +load("//tensorflow:tensorflow.bzl", "tf_py_test") + package( default_visibility = ["//tensorflow:internal"], licenses = ["notice"], # Apache 2.0 diff --git a/tensorflow/python/kernel_tests/proto/BUILD b/tensorflow/python/kernel_tests/proto/BUILD index d9643f3d125..0e935dfe8c4 100644 --- a/tensorflow/python/kernel_tests/proto/BUILD +++ b/tensorflow/python/kernel_tests/proto/BUILD @@ -1,7 +1,7 @@ # Tests of tf.io.*proto. -load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow:tensorflow.bzl", "tf_cc_shared_object") +load("//tensorflow:tensorflow.bzl", "tf_py_test") load("//tensorflow/core/platform:build_config_root.bzl", "if_static") load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library") diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD index 2e5db7edd27..5c30d320fb7 100644 --- a/tensorflow/python/saved_model/BUILD +++ b/tensorflow/python/saved_model/BUILD @@ -2,6 +2,8 @@ # TensorFlow SavedModel. load("//tensorflow:tensorflow.bzl", "cuda_py_test") + +# buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_py_test") package( diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index d72bdf58186..70b03146f34 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -2218,6 +2218,15 @@ def tf_py_test( xla_enabled = False, grpc_enabled = False, tfrt_enabled = False, + # `tfrt_enabled` is set for some test targets, and if we enable + # TFRT tests just by that, this will enable TFRT builds for open source. + # TFRT open source is not fully integrated yet so we need a temporary + # workaround to enable TFRT only for internal builds. `tfrt_enabled_internal` + # will be set by `tensorflow.google.bzl`'s `tf_py_test` target, which is + # only applied for internal builds. + # TODO(b/156911178): Revert this temporary workaround once TFRT open source + # is fully integrated with TF. 
+ tfrt_enabled_internal = False, **kwargs): """Create one or more python tests with extra tensorflow dependencies.""" xla_test_true_list = [] @@ -2261,7 +2270,7 @@ def tf_py_test( deps = depset(deps + xla_test_true_list), **kwargs ) - if tfrt_enabled: + if tfrt_enabled_internal: py_test( name = name + "_tfrt", size = size, From 95620005efbc52a446a232d5e74ee9fec793f918 Mon Sep 17 00:00:00 2001 From: Marat Dukhan Date: Mon, 18 May 2020 10:41:07 -0700 Subject: [PATCH 362/412] Document new methods to enable XNNPACK engine in TFLite PiperOrigin-RevId: 312109175 Change-Id: Iefcbb2ef5d7c83160ef2fc09d668c8e4ac440949 --- tensorflow/lite/delegates/xnnpack/README.md | 45 ++++++++++++++++++--- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/delegates/xnnpack/README.md b/tensorflow/lite/delegates/xnnpack/README.md index e0ef6f0899c..c4e3f540faf 100644 --- a/tensorflow/lite/delegates/xnnpack/README.md +++ b/tensorflow/lite/delegates/xnnpack/README.md @@ -1,15 +1,48 @@ # XNNPACK backend for TensorFlow Lite XNNPACK is a highly optimized library of floating-point neural network -inference operators for ARM, WebAssembly, and x86 platforms. This document -describes how to use the XNNPACK library as a backend for TensorFlow Lite. +inference operators for ARM, x86, and WebAssembly architectures in Android, iOS, +Windows, Linux, macOS, and Emscripten environments. This document describes how +to use the XNNPACK library as an inference engine for TensorFlow Lite. -## Enabling XNNPACK backend in TensorFlow Lite models +## Using XNNPACK engine with TensorFlow Lite interpreter XNNPACK integrates with TensorFlow Lite interpreter through the delegation -mechanism. To leverage XNNPACK library for acceleration, the users need to -create an XNNPACK delegate with the `TfLiteXNNPackDelegateCreate` function, -and call `Interpreter::ModifyGraphWithDelegate` to delegate supported parts of +mechanism. There are three methods to enable XNNPACK engine in TensorFlow Lite. + +### Enable XNNPACK via Bazel build flags (recommended) + +When building TensorFlow Lite with Bazel, add +`--define tflite_with_xnnpack=true`, and the TensorFlow Lite interpreter will +use XNNPACK engine by default. + +The exact command depends on the target platform, e.g. for Android AAR you'd use + +``` +bazel build -c opt --fat_apk_cpu=x86,x86_64,arm64-v8a,armeabi-v7a \ + --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \ + --define tflite_with_xnnpack=true \ + //tensorflow/lite/java:tensorflow-lite +``` + +### Enable XNNPACK via additional dependency + +Another way to enable XNNPACK is to build and link the +`//tensorflow/lite:tflite_with_xnnpack` target into your application alongside +the TensorFlow Lite framework. + +This method works on platforms which support POSIX-style weak symbols (Android, +iOS, Linux, Mac, but **NOT** Windows). + +### Enable XNNPACK via low-level delegate API (not recommended) + +While it is possible to use low-level delegate API to enable XNNPACK, this +method is **NOT RECOMMENDED** unless you need to use TensorFlow Lite both with +and without XNNPACK (e.g. for benchmarking). + +With low-level delegate API users create an XNNPACK delegate with the +`TfLiteXNNPackDelegateCreate` function, and then call +`Interpreter::ModifyGraphWithDelegate` to delegate supported parts of the model to the XNNPACK delegate. The users must destroy the delegate with `TfLiteXNNPackDelegateDelete` **after** releasing the TensorFlow Lite interpreter. 
The snippet below illustrates the typical usage: From 723b2b59946c3a0bfa83b0b5df408e4699c88016 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 10:44:42 -0700 Subject: [PATCH 363/412] enable device tracer test. PiperOrigin-RevId: 312109916 Change-Id: Ibf8f17dc7cfd95aeb991796880161567fcb9ebe4 --- tensorflow/core/profiler/internal/gpu/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index e6ee8514227..c6fe4d77031 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -55,7 +55,6 @@ tf_cc_test_gpu( linkstatic = tf_kernel_tests_linkstatic(), tags = tf_cuda_tests_tags() + [ "nomac", - "notap", # b/154510273 "gpu_cupti", ], deps = [ From 9cf08f43e07c6bb47bd9d41b3c6b0f33811f77c6 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Mon, 18 May 2020 11:17:10 -0700 Subject: [PATCH 364/412] [XLA:Python] Delete deprecated methods from XLA:Python API. PiperOrigin-RevId: 312117146 Change-Id: I232b67b9c4955b7fa6ab7e3ced9446d5ca2ea0e8 --- tensorflow/compiler/xla/python/xla.cc | 114 ------------------- tensorflow/compiler/xla/python/xla_client.py | 10 +- 2 files changed, 5 insertions(+), 119 deletions(-) diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index f10ec978399..0c4695cabf3 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -930,34 +930,6 @@ PYBIND11_MODULE(xla_extension, m) { "client", [](const ClientAndPtr& device) { return device.client; }) .def("__str__", &Device::DebugString) - // TODO(phawkins): remove capitalized names after updating callers. - .def("TransferToInfeed", - [](const Device& device, const LiteralSlice& literal) { - GlobalPyRefManager()->CollectGarbage(); - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - return local_device->client()->TransferToInfeedLocal( - literal, local_device->device_ordinal()); - }) - .def( - "TransferFromOutfeed", - [](const Device& device, const Shape& shape) -> StatusOr { - GlobalPyRefManager()->CollectGarbage(); - std::shared_ptr literal_shared; - { - py::gil_scoped_release gil_release; - TF_ASSIGN_OR_RETURN(LocalDeviceState * local_device, - device.GetLocalDeviceState()); - TF_ASSIGN_OR_RETURN( - Literal literal, - local_device->client()->TransferFromOutfeedLocal( - shape, local_device->device_ordinal())); - - literal_shared = std::make_shared(std::move(literal)); - } - return LiteralToPython(std::move(literal_shared)); - }) .def("transfer_to_infeed", [](const Device& device, const LiteralSlice& literal) { GlobalPyRefManager()->CollectGarbage(); @@ -1244,28 +1216,6 @@ PYBIND11_MODULE(xla_extension, m) { .def("size_of_generated_code_in_bytes", &PjRtExecutable::SizeOfGeneratedCodeInBytes) .def("delete", &PjRtExecutable::Delete) - // TODO(phawkins): delete capitalized methods after updating callers. 
- .def("Delete", &PjRtExecutable::Delete) - .def( - "Execute", - [](const PjRtExecutable& executable, - absl::Span args) - -> StatusOr>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector> output_buffers, - executable.Execute(args, options)); - std::vector> outputs; - outputs.reserve(output_buffers.size()); - for (auto& buffer : output_buffers) { - outputs.push_back(WrapWithClient( - executable.client()->shared_from_this(), std::move(buffer))); - } - return outputs; - }, - py::arg("arguments")) .def( "execute", [](const PjRtExecutable& executable, @@ -1286,33 +1236,6 @@ PYBIND11_MODULE(xla_extension, m) { return outputs; }, py::arg("arguments")) - // TODO(phawkins): delete capitalized methods after updating callers. - .def( - "ExecuteOnLocalDevices", - [](const PjRtExecutable& executable, - absl::Span> args) - -> StatusOr< - std::vector>>> { - py::gil_scoped_release gil_release; - ExecuteOptions options; - options.untuple_result = true; - TF_ASSIGN_OR_RETURN( - std::vector>> - output_buffers, - executable.ExecuteOnLocalDevices(args, options)); - std::vector>> outputs; - outputs.resize(output_buffers.size()); - for (int computation = 0; computation < output_buffers.size(); - ++computation) { - for (auto& buffer : output_buffers[computation]) { - outputs[computation].push_back( - WrapWithClient(executable.client()->shared_from_this(), - std::move(buffer))); - } - } - return outputs; - }, - py::arg("arguments")) .def( "execute_on_local_devices", [](const PjRtExecutable& executable, @@ -1414,12 +1337,6 @@ PYBIND11_MODULE(xla_extension, m) { proto.ParseFromString(serialized_hlo_module_proto); return absl::make_unique(proto); })) - // TODO(phawkins): delete capitalized names after updating callers. - .def("GetProgramShape", &XlaComputation::GetProgramShape) - .def("GetSerializedProto", &GetComputationSerializedProto) - .def("GetHloText", &GetComputationHloText) - .def("GetHloDotGraph", &GetComputationHloDotGraph) - .def("Hash", &HashComputation) .def("get_hlo_module", &GetHloModule) .def("program_shape", &XlaComputation::GetProgramShape) .def("as_serialized_hlo_module_proto", &GetComputationSerializedProto) @@ -1512,28 +1429,7 @@ PYBIND11_MODULE(xla_extension, m) { }, "Builds a computation from the contents of the builder.", py::arg("root") = absl::nullopt) - .def("ClearOpMetadata", &XlaBuilder::ClearOpMetadata) .def("GetShape", &XlaBuilder::GetShape) - .def( - "GetProgramShape", - [](const XlaBuilder& builder, - absl::optional root) -> StatusOr { - return root ? 
builder.GetProgramShape(*root) - : builder.GetProgramShape(); - }, - py::arg("root") = absl::nullopt) - .def("IsConstant", &XlaBuilder::IsConstant) - .def("SetOpMetadata", &XlaBuilder::SetOpMetadata) - .def("SetSharding", &XlaBuilder::SetSharding) - .def("ClearSharding", &XlaBuilder::ClearSharding) - .def("SetUpAlias", - [](XlaBuilder& builder, const std::vector& output_index, - int64 param_number, const std::vector& param_index) { - builder.SetUpAlias( - ShapeIndex(output_index.begin(), output_index.end()), - param_number, - ShapeIndex(param_index.begin(), param_index.end())); - }) .def( "build", [](XlaBuilder& builder, absl::optional root) { @@ -1564,17 +1460,7 @@ PYBIND11_MODULE(xla_extension, m) { ShapeIndex(param_index.begin(), param_index.end())); }); - // TODO(phawkins): delete capitalized names after updating callers - m.def("BufferToDLPackManagedTensor", BufferToDLPackManagedTensor); m.def("buffer_to_dlpack_managed_tensor", BufferToDLPackManagedTensor); - m.def("DLPackManagedTensorToBuffer", - [](const py::capsule& tensor, std::shared_ptr client) - -> StatusOr> { - TF_ASSIGN_OR_RETURN( - std::unique_ptr buffer, - DLPackManagedTensorToBuffer(tensor, client.get())); - return WrapWithClient(std::move(client), std::move(buffer)); - }); m.def("dlpack_managed_tensor_to_buffer", [](const py::capsule& tensor, std::shared_ptr client) -> StatusOr> { diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index d9cd906939d..76c3bc33a91 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -300,13 +300,13 @@ CompileOptions = _xla.CompileOptions # An Executable is a C++ class that duck types with the following API: # class Executable(object): # def local_devices(self) -> [Device]: -# def Execute(self, arguments : [Buffer]) -> Buffer: +# def execute(self, arguments : [Buffer]) -> Buffer: # """Execute on one replica with Buffer arguments and return value.""" # -# def SizeOfGeneratedCodeInBytes(self) -> int: +# def size_of_generated_code_in_bytes(self) -> int: # """Return generated binary size, or -1 if not known.""" # -# def ExecuteOnLocalDevices(self, arguments: [[Buffer]]) -> [Buffer]: +# def execute_on_local_devices(self, arguments: [[Buffer]]) -> [Buffer]: # """Execute on many replicas with Buffer arguments and return value. # # Args: @@ -329,7 +329,7 @@ def execute_with_python_values(executable, arguments, backend): return backend.buffer_from_pyval(arg, device=executable.local_devices()[0]) arguments = [put(arg) for arg in arguments] - outputs = executable.Execute(arguments) + outputs = executable.execute(arguments) return [x.to_py() for x in outputs] @@ -359,7 +359,7 @@ def execute_with_python_values_replicated(executable, arguments, backend): flat_arg_buffers = flat_arg_buffers[len(replica_args):] return [[x.to_py() for x in xs] - for xs in executable.ExecuteOnLocalDevices(arg_buffers)] + for xs in executable.execute_on_local_devices(arg_buffers)] class PaddingType(enum.Enum): From ef45324fc62fc9a911e5771a40f9790900500de9 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 18 May 2020 11:26:48 -0700 Subject: [PATCH 365/412] Hexagon Delegate - Allow optional tensors as valid tensors in inputs. - Update fully connected builder to handle optional bias tensor. 
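For illustration, one hypothetical way to produce such a model from Keras (a sketch only; a real Hexagon deployment would additionally require the model to be quantized to uint8/int8):

```python
import tensorflow as tf

# Dense(use_bias=False) lowers to a FULLY_CONNECTED op whose bias input is
# absent (tensor index -1), the case the delegate must now tolerate.
model = tf.keras.Sequential(
    [tf.keras.layers.Dense(3, use_bias=False, input_shape=(10,))])
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()
```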
PiperOrigin-RevId: 312119090 Change-Id: If905792a78f61abde0f269ed252aa2501ae60815 --- .../hexagon/builders/matmul_builder.cc | 68 +++++++++-------- .../hexagon/builders/tests/matmul_test.cc | 73 +++++++++++++++++-- .../experimental/delegates/hexagon/utils.cc | 21 ++++-- 3 files changed, 116 insertions(+), 46 deletions(-) diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc index c53e62d27a7..c0c815ffdcc 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/matmul_builder.cc @@ -129,35 +129,41 @@ TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, // Bias tensor. int bias_tensor_id = inputs->data[2]; - const auto& bias_tensor = context->tensors[bias_tensor_id]; - auto* const_bias_node = - graph_builder_->AddConstNodeWithData(bias_tensor_id, bias_tensor); - graph_builder_->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), 0); - ComputeMinAndMaxQuantValues(bias_tensor, &bias_min_, &bias_max_); - auto* bias_min_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_min_), - sizeof(bias_min_)); - auto* bias_max_const = graph_builder_->AddConstNodeWithData( - quant_bound_shape, reinterpret_cast(&bias_max_), - sizeof(bias_max_)); + TensorID matmul_and_bias_out = matmul_out, + matmul_and_bias_out_min = matmul_out_min, + matmul_and_bias_out_max = matmul_out_max; + if (bias_tensor_id != -1) { + const auto& bias_tensor = context->tensors[bias_tensor_id]; + auto* const_bias_node = + graph_builder_->AddConstNodeWithData(bias_tensor_id, bias_tensor); + graph_builder_->AddTensorWithID(bias_tensor_id, const_bias_node->GetID(), + 0); + ComputeMinAndMaxQuantValues(bias_tensor, &bias_min_, &bias_max_); + auto* bias_min_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&bias_min_), + sizeof(bias_min_)); + auto* bias_max_const = graph_builder_->AddConstNodeWithData( + quant_bound_shape, reinterpret_cast(&bias_max_), + sizeof(bias_max_)); - // MatMul + Bias. - auto* bias_add_op = graph_builder_->AddNode(GetTFLiteNodeID()); - bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); - bias_add_op->AddInput(matmul_out); - bias_add_op->AddInput(graph_builder_->GetHexagonTensorId(bias_tensor_id)); - bias_add_op->AddInput(matmul_out_min); - bias_add_op->AddInput(matmul_out_max); - bias_add_op->AddInput(TensorID(bias_min_const->GetID(), 0)); - bias_add_op->AddInput(TensorID(bias_max_const->GetID(), 0)); - const auto& bias_add_out = - bias_add_op->AddOutput(sizeof(int32_t), 4, - {output_batch_size, output_height_size, - output_width_size, output_depth_size}); - const auto& bias_add_out_min = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); - const auto& bias_add_out_max = - bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + // MatMul + Bias. 
+ auto* bias_add_op = graph_builder_->AddNode(GetTFLiteNodeID()); + bias_add_op->SetOpType(OP_QuantizedBiasAdd_32p32to32); + bias_add_op->AddInput(matmul_out); + bias_add_op->AddInput(graph_builder_->GetHexagonTensorId(bias_tensor_id)); + bias_add_op->AddInput(matmul_out_min); + bias_add_op->AddInput(matmul_out_max); + bias_add_op->AddInput(TensorID(bias_min_const->GetID(), 0)); + bias_add_op->AddInput(TensorID(bias_max_const->GetID(), 0)); + matmul_and_bias_out = + bias_add_op->AddOutput(sizeof(int32_t), 4, + {output_batch_size, output_height_size, + output_width_size, output_depth_size}); + matmul_and_bias_out_min = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + matmul_and_bias_out_max = + bias_add_op->AddOutput(sizeof(float), 4, {1, 1, 1, 1}); + } // Quantize 32-bit result into 8-bit format using output tensor min/max. ComputeMinAndMaxQuantValues(context->tensors[outputs->data[0]], &output_min_, @@ -170,9 +176,9 @@ TfLiteStatus MatMulOpBuilder::PopulateSubGraph(const TfLiteIntArray* inputs, sizeof(output_max_)); auto* quantize_biasadd_op = graph_builder_->AddNode(GetTFLiteNodeID()); quantize_biasadd_op->SetOpType(OP_Requantize_32to8); - quantize_biasadd_op->AddInput(bias_add_out); - quantize_biasadd_op->AddInput(bias_add_out_min); - quantize_biasadd_op->AddInput(bias_add_out_max); + quantize_biasadd_op->AddInput(matmul_and_bias_out); + quantize_biasadd_op->AddInput(matmul_and_bias_out_min); + quantize_biasadd_op->AddInput(matmul_and_bias_out_max); quantize_biasadd_op->AddInput(TensorID(output_min_const->GetID(), 0)); quantize_biasadd_op->AddInput(TensorID(output_max_const->GetID(), 0)); node_output_ = diff --git a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc index a16e22888dd..3a5f320a6a7 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/builders/tests/matmul_test.cc @@ -22,7 +22,7 @@ using testing::ElementsAreArray; class FullyConnectedOpModel : public SingleOpModelWithHexagon { public: FullyConnectedOpModel(int units, int batches, const TensorData& input, - const TensorData& output) + const TensorData& output, bool optional_bias = false) : batches_(batches), units_(units) { int total_input_size = 1; for (size_t i = 0; i < input.shape.size(); ++i) { @@ -34,9 +34,13 @@ class FullyConnectedOpModel : public SingleOpModelWithHexagon { weights_ = AddInput({input.type, {units_, input_size_}, input.min, input.max}); - auto bias_scale = GetScale(input_) * GetScale(weights_); - TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale}; - bias_ = AddInput(bias); + if (optional_bias) { + bias_ = AddNullInput(); + } else { + auto bias_scale = GetScale(input_) * GetScale(weights_); + TensorData bias{TensorType_INT32, {units_}, 0, 0, bias_scale}; + bias_ = AddInput(bias); + } output_ = AddOutput(output); @@ -46,15 +50,16 @@ class FullyConnectedOpModel : public SingleOpModelWithHexagon { FullyConnectedOptionsWeightsFormat_DEFAULT, /*keep_num_dims=*/false) .Union()); - - BuildInterpreter({GetShape(input_), GetShape(weights_), GetShape(bias_)}); + BuildInterpreter({GetShape(input_), GetShape(weights_)}); // Weights & bias tensors need to be constant. // We don't use AddConstInput to allow setting filter values later. 
auto* weights_tensor = interpreter_->tensor(weights_); weights_tensor->allocation_type = kTfLiteMmapRo; - auto* bias_tensor = interpreter_->tensor(bias_); - bias_tensor->allocation_type = kTfLiteMmapRo; + if (!optional_bias) { + auto* bias_tensor = interpreter_->tensor(bias_); + bias_tensor->allocation_type = kTfLiteMmapRo; + } } void SetBias(const std::vector& data) { @@ -146,4 +151,56 @@ TEST(QuantizedFullyConnectedOpTest, TestQuantizedUint8) { ElementsAre(151, 152, 153, 185, 186, 187)); } +TEST(QuantizedFullyConnectedOpTest, TestQuantizedUint8_NoBias) { + FullyConnectedOpModel m( + /*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_UINT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_UINT8, {}, -127, 128}, /*optional_bias*/ true); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output))); +} + +TEST(QuantizedFullyConnectedOpTest, TestQuantizedInt8_NoBias) { + FullyConnectedOpModel m(/*units=*/3, /*batches*/ 2, + /*input=*/{TensorType_INT8, {2, 10}, -63.5, 64}, + /*output=*/{TensorType_INT8, {}, -127, 128}, + /*optional_bias*/ true); + + m.SetWeights({ + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 0 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 1 + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, // u = 2 + }); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + auto reference_output = m.GetDequantizedOutput(); + + m.ApplyDelegateAndInvoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear(reference_output))); +} + } // namespace tflite diff --git a/tensorflow/lite/experimental/delegates/hexagon/utils.cc b/tensorflow/lite/experimental/delegates/hexagon/utils.cc index 8aff13549b8..ae7f6994657 100644 --- a/tensorflow/lite/experimental/delegates/hexagon/utils.cc +++ b/tensorflow/lite/experimental/delegates/hexagon/utils.cc @@ -116,6 +116,9 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, int tensor_id; for (int i = 0; i < node->inputs->size; ++i) { tensor_id = node->inputs->data[i]; + // Skip optional tensors. Builders should handle optional tensors + // not available. 
+ if (tensor_id == -1) continue; const auto& tensor = context->tensors[tensor_id]; if (tensor.dims->size > 4) return false; } @@ -191,19 +194,22 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, if (!InputsWithCorrectTypes(node, context, {{kTfLiteUInt8, kTfLiteInt8}, {kTfLiteUInt8, kTfLiteInt8}, - {kTfLiteInt32}})) + {kTfLiteInt32, kTfLiteNoType}})) return false; const auto& weights_tensor = context->tensors[node->inputs->data[1]]; - const auto& bias_tensor = context->tensors[node->inputs->data[2]]; - const bool weights_and_bias_const = - weights_tensor.allocation_type == kTfLiteMmapRo && - bias_tensor.allocation_type == kTfLiteMmapRo; + bool bias_const_or_no_bias = true; + if (node->inputs->data[2] != -1) { + const auto& bias_tensor = context->tensors[node->inputs->data[2]]; + bias_const_or_no_bias = bias_tensor.allocation_type == kTfLiteMmapRo; + } + const bool weights_const = + weights_tensor.allocation_type == kTfLiteMmapRo; const TfLiteFullyConnectedParams* matmul_params = reinterpret_cast( node->builtin_data); - return (weights_and_bias_const && + return (weights_const && bias_const_or_no_bias && IsActivationReluOrNone(matmul_params->activation) && matmul_params->keep_num_dims == false && matmul_params->weights_format == @@ -335,7 +341,8 @@ bool IsNodeSupportedByHexagon(const TfLiteRegistration* registration, return false; const auto& input_tensor = context->tensors[node->inputs->data[1]]; const bool is_four_dim_or_less = input_tensor.dims->size < 5; - // We need splitting axis to be constant, so Hexagon knows output shapes. + // We need splitting axis to be constant, so Hexagon knows output + // shapes. return is_four_dim_or_less && IsConstantTensor(GetInput(context, node, 0)); } From 6f19d507f4955f571582349213c69991868379bb Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 18 May 2020 11:50:56 -0700 Subject: [PATCH 366/412] [XLA] Fix rendering of the RngBitGenerator description table PiperOrigin-RevId: 312123981 Change-Id: I9d1ecdf88dfb9f5689dcfc26f6243a192ab55dd6 --- .../compiler/xla/g3doc/operation_semantics.md | 32 +++++++++++-------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 495701eaac2..002d07184a7 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -2299,20 +2299,26 @@ The output is guaranteed to be a deterministic function of the initial state but it is *not* guaranteed to be deterministic between backends and different compiler versions. -`RngBitGenerator(algorithm, key, shape)` | Arguments | Type | Semantics | -|---------------- | ----------------- | ------------------------------------- | -| `algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. | | -`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. | | `shape` | -`Shape` | Output shape for generated data. | +`RngBitGenerator(algorithm, key, shape)` -Available values for `algorithm`: * `rng_default`: Backend specific algorithm -with backend specific shape requirements. * `rng_three_fry`: ThreeFry -counter-based PRNG algorithm. The `initial_state` shape is `u64[2]` with -arbitrary values. -[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) -* `rng_philox`: Philox algorithm to generate random numbers in parallel. The -`initial_state` shape is `u64[3]` with arbitrary values. 
-[Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) +Arguments | Type | Semantics +--------------- | ----------------- | ------------------------------------- +`algorithm` | `RandomAlgorithm` | PRNG algorithm to be used. +`initial_state` | `XlaOp` | Initial state for the PRNG algorithm. +`shape` | `Shape` | Output shape for generated data. + +Available values for `algorithm`: + +- `rng_default`: Backend specific algorithm with backend specific shape + requirements. + +- `rng_three_fry`: ThreeFry counter-based PRNG algorithm. The `initial_state` + shape is `u64[2]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) + +- `rng_philox`: Philox algorithm to generate random numbers in parallel. The + `initial_state` shape is `u64[3]` with arbitrary values. + [Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3.](http://www.thesalmons.org/john/random123/papers/random123sc11.pdf) ## Scatter From 672e419c9f7e331fff4449799e8cd7c476ac4b7c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 12:35:23 -0700 Subject: [PATCH 367/412] Enable tests for tf.linalg.lu in eager mode. PiperOrigin-RevId: 312132817 Change-Id: I0dd5b96cc2b3462817e0637794a623c24bd0f989 --- tensorflow/python/kernel_tests/lu_op_test.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py index 7935b66f4af..de9d8c32cb5 100644 --- a/tensorflow/python/kernel_tests/lu_op_test.py +++ b/tensorflow/python/kernel_tests/lu_op_test.py @@ -30,7 +30,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import linalg_ops from tensorflow.python.ops import map_fn from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.ops import variables from tensorflow.python.platform import benchmark from tensorflow.python.platform import test @@ -214,15 +214,20 @@ class LuOpTest(test.TestCase): data = np.random.rand(n, n) + 1j * np.random.rand(n, n) self._verifyLu(data) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testEmpty(self): self._verifyLu(np.empty([0, 2, 2])) self._verifyLu(np.empty([2, 0, 0])) - @test_util.run_deprecated_v1 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) + matrix_shape = [5, 5] + seed = [42, 24] + matrix1 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + matrix2 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + self.assertAllEqual(matrix1, matrix2) lu1, p1 = linalg_ops.lu(matrix1) lu2, p2 = linalg_ops.lu(matrix2) lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2]) From 7254343a10ba00d48f828981cec3e3587e667ca9 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 12:37:47 -0700 Subject: [PATCH 368/412] Enable tests for tf.linalg.matrix_square_root in eager mode. 
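As in the preceding lu_op_test change, the concurrent-execution test below switches from graph-seeded random_normal to stateless RNG so that the two input matrices are bitwise identical under both graph and eager execution. A minimal sketch of the idiom, using only the op the diff itself adopts:

  from tensorflow.python.ops import stateless_random_ops

  seed = [42, 24]  # stateless ops take a 2-element seed
  a = stateless_random_ops.stateless_random_normal(shape=[5, 5], seed=seed)
  b = stateless_random_ops.stateless_random_normal(shape=[5, 5], seed=seed)
  # a and b are identical tensors, unlike two eager calls to
  # random_ops.random_normal that share only an op-level seed.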
PiperOrigin-RevId: 312133318 Change-Id: I541a94a21594384fba30a9198ad5a7300537c498 --- .../matrix_square_root_op_test.py | 37 +++++++++++-------- 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py index c36d83e2530..6cf330ed981 100644 --- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py +++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py @@ -21,10 +21,11 @@ from __future__ import print_function import numpy as np from tensorflow.python.framework import constant_op +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import test_util from tensorflow.python.ops import gen_linalg_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import random_ops +from tensorflow.python.ops import stateless_random_ops from tensorflow.python.platform import test @@ -89,31 +90,35 @@ class SquareRootOpTest(test.TestCase): self._verifySquareRootReal(np.empty([0, 2, 2])) self._verifySquareRootReal(np.empty([2, 0, 0])) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testWrongDimensions(self): # The input to the square root should be at least a 2-dimensional tensor. tensor = constant_op.constant([1., 2.]) - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): gen_linalg_ops.matrix_square_root(tensor) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testNotSquare(self): - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]]) self.evaluate(gen_linalg_ops.matrix_square_root(tensor)) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def testConcurrentExecutesWithoutError(self): - with test_util.use_gpu(): - matrix1 = random_ops.random_normal([5, 5], seed=42) - matrix2 = random_ops.random_normal([5, 5], seed=42) - square1 = math_ops.matmul(matrix1, matrix1) - square2 = math_ops.matmul(matrix2, matrix2) - sqrt1 = gen_linalg_ops.matrix_square_root(square1) - sqrt2 = gen_linalg_ops.matrix_square_root(square2) - all_ops = [sqrt1, sqrt2] - sqrt = self.evaluate(all_ops) - self.assertAllClose(sqrt[0], sqrt[1]) + matrix_shape = [5, 5] + seed = [42, 24] + matrix1 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + matrix2 = stateless_random_ops.stateless_random_normal( + shape=matrix_shape, seed=seed) + self.assertAllEqual(matrix1, matrix2) + square1 = math_ops.matmul(matrix1, matrix1) + square2 = math_ops.matmul(matrix2, matrix2) + sqrt1 = gen_linalg_ops.matrix_square_root(square1) + sqrt2 = gen_linalg_ops.matrix_square_root(square2) + all_ops = [sqrt1, sqrt2] + sqrt = self.evaluate(all_ops) + self.assertAllClose(sqrt[0], sqrt[1]) if __name__ == "__main__": From b5436f9d5fe7bdfc8e42f0b27328a8457d48ccf6 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Mon, 18 May 2020 12:43:30 -0700 Subject: [PATCH 369/412] Rename TraceMe::SetMetadata to TraceMe::AppendMetadata and add lambda overload. 
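For reference, a rough sketch of driving the Python-side wrapper this change touches; the import path is assumed from the pybind target in the diff below, and the "#key=value#" string mirrors the encoding produced by TraceMeEncode:

  from tensorflow.python.profiler.internal import _pywrap_traceme

  trace = _pywrap_traceme.TraceMe("my_trace")
  trace.Enter()
  # Forwarded to TraceMe::AppendMetadata; a no-op unless tracing is active.
  trace.SetMetadata("#key1=value1,key2=42#")
  trace.Exit()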
PiperOrigin-RevId: 312134462 Change-Id: Ia1a0f7de954fba6c0b05a6beae10cc08dc803cfc --- tensorflow/core/profiler/lib/BUILD | 2 + tensorflow/core/profiler/lib/traceme.h | 56 +++++++++----- tensorflow/core/profiler/lib/traceme_encode.h | 73 +++++++++++++++---- tensorflow/python/profiler/internal/BUILD | 1 + .../profiler/internal/traceme_wrapper.cc | 10 ++- 5 files changed, 107 insertions(+), 35 deletions(-) diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 0aa1a5d6b67..5bb9236efb3 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -94,6 +94,7 @@ cc_library( hdrs = ["traceme.h"], visibility = ["//visibility:public"], deps = [ + ":traceme_encode", "@com_google_absl//absl/strings", "//tensorflow/core:lib", "//tensorflow/core/platform", @@ -159,6 +160,7 @@ filegroup( "profiler_session.h", "scoped_annotation.h", "traceme.h", + "traceme_encode.h", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index 2c3e3ebe6cc..ec5f6765afb 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -28,6 +28,7 @@ limitations under the License. #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/profiler/internal/traceme_recorder.h" #endif +#include "tensorflow/core/profiler/lib/traceme_encode.h" // IWYU pragma: export namespace tensorflow { namespace profiler { @@ -123,13 +124,20 @@ class TraceMe { explicit TraceMe(const char* raw, int level = 1) : TraceMe(absl::string_view(raw), level) {} - // This overload only generates the activity name if tracing is enabled. - // Useful for avoiding things like string concatenation when tracing is - // disabled. The |name_generator| may be a lambda or functor that returns a - // type that the string() constructor can take. + // This overload only generates the name (and possibly metadata) if tracing is + // enabled. Useful for avoiding expensive operations (e.g., string + // concatenation) when tracing is disabled. + // name_generator may be a lambda or functor that returns a type that the + // string() constructor can take, e.g., the result of TraceMeEncode. // name_generator is templated, rather than a std::function to avoid // allocations std::function might make even if never called. - // Usage: profiler::TraceMe([&]{ return StrCat(prefix, ":", postfix); }); + // Example Usage: + // TraceMe op_trace_me([&]() { + // return StrCat(op_name, ":", op_type); + // } + // TraceMe trace_me_with_metadata([&value1]() { + // return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); + // }); template explicit TraceMe(NameGeneratorT name_generator, int level = 1) { DCHECK_GE(level, 1); @@ -167,21 +175,35 @@ class TraceMe { #endif } - // Sets new_metadata in the metadata part of no_init_.name. - void SetMetadata(absl::string_view new_metadata) { + // Appends new_metadata to the TraceMe name passed to the constructor. + // metadata_generator may be a lambda or functor that returns a type that the + // string() constructor can take, e.g., the result of TraceMeEncode. + // metadata_generator is only evaluated when tracing is enabled. + // metadata_generator is templated, rather than a std::function to avoid + // allocations std::function might make even if never called. 
+ // Example Usage: + // trace_me.AppendMetadata([&value1]() { + // return TraceMeEncode({{"key1", value1}, {"key2", 42}}); + // }); + template + void AppendMetadata(MetadataGeneratorT metadata_generator) { #if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { - std::string& name = no_init_.name; - DCHECK(!name.empty()); - DCHECK(!new_metadata.empty()); - if (name.back() == '#') { // name already has metadata - name.back() = ','; - if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { - new_metadata.remove_prefix(1); - } - } - name.append(new_metadata.data(), new_metadata.size()); + traceme_internal::AppendMetadata(&no_init_.name, metadata_generator()); + } + } +#endif + } + + // Appends new_metadata to the payload. + // This overload should only be used by other TraceMe APIs. + // Prefer the overload above instead. + void AppendMetadata(absl::string_view new_metadata) { +#if !defined(IS_MOBILE_PLATFORM) + if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { + if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { + traceme_internal::AppendMetadata(&no_init_.name, new_metadata); } } #endif diff --git a/tensorflow/core/profiler/lib/traceme_encode.h b/tensorflow/core/profiler/lib/traceme_encode.h index 772f56a2153..2e23c6d878b 100644 --- a/tensorflow/core/profiler/lib/traceme_encode.h +++ b/tensorflow/core/profiler/lib/traceme_encode.h @@ -28,7 +28,7 @@ limitations under the License. namespace tensorflow { namespace profiler { -namespace internal { +namespace traceme_internal { // Copies the contents of str to the address pointed by out. // Returns the address after the copy. @@ -36,24 +36,18 @@ namespace internal { TF_ATTRIBUTE_ALWAYS_INLINE inline char* Append(char* out, absl::string_view str) { const size_t str_size = str.size(); - if (str_size > 0) { + if (TF_PREDICT_TRUE(str_size > 0)) { memcpy(out, str.data(), str_size); out += str_size; } return out; } -} // namespace internal - -// Encodes an event name and arguments into a string stored by TraceMe. -// Use within a lambda to avoid expensive operations when tracing is inactive. -// Example Usage: -// TraceMe trace_me([&name, value1]() { -// return TraceMeEncode(name, {{"key1", value1}, {"key2", 42}}); -// }); -inline std::string TraceMeEncode( +// Appends args encoded as TraceMe metadata to name. +TF_ATTRIBUTE_ALWAYS_INLINE inline std::string AppendArgs( std::string name, - std::initializer_list> args) { + const std::initializer_list>& + args) { if (TF_PREDICT_TRUE(args.size() > 0)) { const auto old_size = name.size(); auto new_size = old_size + args.size() * 2 + 1; @@ -65,9 +59,9 @@ inline std::string TraceMeEncode( char* out = begin + old_size; *out++ = '#'; for (const auto& arg : args) { - out = internal::Append(out, arg.first); + out = Append(out, arg.first); *out++ = '='; - out = internal::Append(out, arg.second.Piece()); + out = Append(out, arg.second.Piece()); *out++ = ','; } *(out - 1) = '#'; @@ -76,6 +70,57 @@ inline std::string TraceMeEncode( return name; } +// Appends new_metadata to the metadata part of name. 
+TF_ATTRIBUTE_ALWAYS_INLINE inline void AppendMetadata( + std::string* name, absl::string_view new_metadata) { + if (!TF_PREDICT_FALSE(new_metadata.empty())) { + if (!name->empty() && name->back() == '#') { // name already has metadata + name->back() = ','; + if (TF_PREDICT_TRUE(new_metadata.front() == '#')) { + new_metadata.remove_prefix(1); + } + } + name->append(new_metadata.data(), new_metadata.size()); + } +} + +} // namespace traceme_internal + +// Encodes an event name and arguments into TraceMe metadata. +// Use within a lambda to avoid expensive operations when tracing is disabled. +// Example Usage: +// TraceMe trace_me([value1]() { +// return TraceMeEncode("my_trace", {{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::string name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::move(name), args); +} +inline std::string TraceMeEncode( + absl::string_view name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(name), args); +} +inline std::string TraceMeEncode( + const char* name, + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(name), args); +} + +// Encodes arguments into TraceMe metadata. +// Use within a lambda to avoid expensive operations when tracing is disabled. +// Example Usage: +// TraceMe trace_me("my_trace"); +// ... +// trace_me.AppendMetadata([value1]() { +// return TraceMeEncode({{"key1", value1}, {"key2", 42}}); +// }); +inline std::string TraceMeEncode( + std::initializer_list> args) { + return traceme_internal::AppendArgs(std::string(), args); +} + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD index d9f93c2fb21..9b0f216508e 100644 --- a/tensorflow/python/profiler/internal/BUILD +++ b/tensorflow/python/profiler/internal/BUILD @@ -89,6 +89,7 @@ tf_python_pybind_extension( deps = [ "//tensorflow/core:lib", "//tensorflow/core/profiler/lib:traceme_headers", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", "@pybind11", ], diff --git a/tensorflow/python/profiler/internal/traceme_wrapper.cc b/tensorflow/python/profiler/internal/traceme_wrapper.cc index a1b5370836b..6b0098e316d 100644 --- a/tensorflow/python/profiler/internal/traceme_wrapper.cc +++ b/tensorflow/python/profiler/internal/traceme_wrapper.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include +#include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "pybind11/pybind11.h" #include "tensorflow/core/platform/types.h" @@ -27,13 +29,13 @@ namespace { // Helper to implement TraceMe as a context manager in Python. 
class TraceMeWrapper { public: - explicit TraceMeWrapper(const tensorflow::string& name) : name_(name) {} + explicit TraceMeWrapper(const std::string& name) : name_(name) {} void Enter() { traceme_.emplace(std::move(name_)); } - void SetMetadata(const tensorflow::string& new_metadata) { + void SetMetadata(const std::string& new_metadata) { if (TF_PREDICT_TRUE(traceme_)) { - traceme_->SetMetadata(new_metadata); + traceme_->AppendMetadata(absl::string_view(new_metadata)); } } @@ -50,7 +52,7 @@ class TraceMeWrapper { PYBIND11_MODULE(_pywrap_traceme, m) { py::class_ traceme_class(m, "TraceMe"); - traceme_class.def(py::init()) + traceme_class.def(py::init()) .def("Enter", &TraceMeWrapper::Enter) .def("Exit", &TraceMeWrapper::Exit) .def("SetMetadata", &TraceMeWrapper::SetMetadata) From 8e661af54d9787b2a3a2371cc6efcfa1d8db6a34 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Mon, 18 May 2020 13:03:24 -0700 Subject: [PATCH 370/412] [XLA] Simplify tautological compares (and (< x A) (< x B)) to (< x A) when `a <= B` holds. This is required for figuring out the trip count of loops whose condition contains the conjunction. Such conjunctions arise from TF when a for loop with `tf.range` is lowered, or when using `tf.while_loop` with `maximum_iterations` set. PiperOrigin-RevId: 312138518 Change-Id: I12c5c7d0aeedbf0d375f3cff1d23b39aea89f64a --- .../xla/service/algebraic_simplifier.cc | 65 +++++++++++++++++++ .../xla/service/algebraic_simplifier_test.cc | 19 ++++++ 2 files changed, 84 insertions(+) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 55af8726dc8..ecbf2075abe 100755 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -508,6 +508,13 @@ class AlgebraicSimplifierVisitor : public DfsHloRewriteVisitor { // Tries to convert slice(reshape(X)) into reshape(slice(X)) StatusOr TryToReorderSliceAndReshape(HloInstruction* slice); + // Tries to simplify `(and (< a N) (< a K))` in cases where `N <= K` into + // `(< a N)`. This is crucial for being able to figure out the loop trip + // count. + // + // Assumes that the input is conjunction. + StatusOr TrySimplifyTautologicalCompare(HloInstruction* conjunction); + // Useful when we want to use the same visitor over multiple computations. 
void ResetState(HloComputation* computation); @@ -856,6 +863,57 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) { return Status::OK(); } +StatusOr AlgebraicSimplifierVisitor::TrySimplifyTautologicalCompare( + HloInstruction* conjunction) { + HloInstruction *lhs, *rhs; + if (!Match(conjunction, m::And(m::Op(&lhs), m::Op(&rhs)))) { + return false; + } + struct LessThanCompareInfo { // (LT var constant) + HloInstruction* var; + int64 constant; + }; + + auto get_compare_info_helper = + [&](HloInstruction* lhs, + HloInstruction* rhs) -> absl::optional { + if (!Match(rhs, m::Constant().WithShape( + m::Shape().IsEffectiveScalar().WithElementType( + PrimitiveType::S32)))) { + return absl::nullopt; + } + return {LessThanCompareInfo{lhs, *rhs->literal().GetFirstInteger()}}; + }; + + auto get_compare_info = + [&](HloInstruction* cmp) -> absl::optional { + HloInstruction *lhs, *rhs; + if (!Match(cmp, m::Compare(m::Op(&lhs), m::Op(&rhs)) + .WithComparisonDirection(ComparisonDirection::kLt))) { + return absl::nullopt; + } + if (auto match1 = get_compare_info_helper(lhs, rhs)) { + return match1; + } else if (auto match2 = get_compare_info_helper(rhs, lhs)) { + return match2; + } + return absl::nullopt; + }; + + absl::optional lhs_info = get_compare_info(lhs); + absl::optional rhs_info = get_compare_info(rhs); + if (lhs_info && rhs_info && lhs_info->var == rhs_info->var) { + int64 new_bound = std::min(lhs_info->constant, rhs_info->constant); + TF_RETURN_IF_ERROR(ReplaceWithNewInstruction( + conjunction, + HloInstruction::CreateCompare(lhs->shape(), lhs_info->var, + MakeScalarLike(lhs_info->var, new_bound), + ComparisonDirection::kLt))); + return true; + } + return false; +} + Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { HloInstruction *lhs, *rhs; CHECK(Match(logical_and, m::And(m::Op(&lhs), m::Op(&rhs)))); @@ -890,6 +948,13 @@ Status AlgebraicSimplifierVisitor::HandleAnd(HloInstruction* logical_and) { return Status::OK(); } + // Simplify tautological conjunctions. 
+ TF_ASSIGN_OR_RETURN(bool found_tautological_compare, + TrySimplifyTautologicalCompare(logical_and)); + if (found_tautological_compare) { + return Status::OK(); + } + return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 6c8e80aa963..08a004e39fe 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -5761,6 +5761,25 @@ TEST_F(AlgebraicSimplifierTest, CompareSame) { GmockMatch(m::Broadcast(m::ConstantScalar(true)))); } +TEST_F(AlgebraicSimplifierTest, CompareSimplified) { + const char* kModuleStr = R"( + HloModule m + test { + param = s32[] parameter(0) + c1 = s32[] constant(10) + c2 = s32[] constant(100) + cmp1 = pred[] compare(param, c1), direction=LT + cmp2 = pred[] compare(param, c2), direction=LT + ROOT out = pred[] and(cmp1, cmp2) + })"; + TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr)); + ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie()); + EXPECT_THAT( + m->entry_computation()->root_instruction(), + GmockMatch(m::Compare(m::Op(), m::Op().IsConstantScalar(10)) + .WithComparisonDirection(ComparisonDirection::kLt))); +} + TEST_F(AlgebraicSimplifierTest, CanDisableDotToMultiplyRewrite) { // Some backends may have better performance by treating an outer product as a // Dot, rather than a broadcast Multiply From 869920697b243622073317ddc533bdff41684c41 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 18 May 2020 13:27:55 -0700 Subject: [PATCH 371/412] [tf.lite] Use in-process conversion when the new converter is used Out-of-process conversion was a workaround for the legacy converter, which would generally crash the process when conversion failed. However, out-of-process conversion also adds a good deal of complexity, so avoid it when using the new conversion backend. PiperOrigin-RevId: 312142994 Change-Id: I7ddc83df99ccf24be6e15f46d6a116dce8321933 --- tensorflow/lite/python/convert.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py index 6b7a32f1bcc..a5fbb88132e 100644 --- a/tensorflow/lite/python/convert.py +++ b/tensorflow/lite/python/convert.py @@ -169,9 +169,10 @@ def toco_convert_protos(model_flags_str, RuntimeError: When conversion fails, an exception is raised with the error message embedded. """ - # TODO(aselle): When toco does not use fatal errors for failure, we can - # switch this on. - if not _toco_from_proto_bin: + # Historically, TOCO conversion failures would trigger a crash, so we would + # attempt to run the converter out-of-process. The MLIR conversion pipeline + # surfaces errors instead, and can be safely run in-process. 
+ if enable_mlir_converter or not _toco_from_proto_bin: try: model_str = wrap_toco.wrapped_toco_convert(model_flags_str, toco_flags_str, input_data_str, From da67fcddef242a0c358f4acc5f263880c1863836 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Mon, 18 May 2020 13:36:18 -0700 Subject: [PATCH 372/412] Edit Hexagon documentation to reflect new supported models PiperOrigin-RevId: 312144610 Change-Id: I9c8b0d9ad6ea4b745b4bb985ca143cca660a5b14 --- .../g3doc/performance/hexagon_delegate.md | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/hexagon_delegate.md b/tensorflow/lite/g3doc/performance/hexagon_delegate.md index 60fe9465bf4..0e947d1d5e1 100644 --- a/tensorflow/lite/g3doc/performance/hexagon_delegate.md +++ b/tensorflow/lite/g3doc/performance/hexagon_delegate.md @@ -22,15 +22,15 @@ are supported, including: **Supported models:** -The Hexagon delegate currently supports quantized models generated using -[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize), -e.g., -[these quantized models](https://www.tensorflow.org/lite/guide/hosted_models#quantized_models) -hosted on the TensorFlow Lite repo. It does not (yet) support models with -[8-bit symmetric quantization spec](https://www.tensorflow.org/lite/performance/quantization_spec). -Sample models include -[MobileNet V1](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz), -[SSD Mobilenet](https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip). +The Hexagon delegate supports all models that conform to our +[8-bit symmetric quantization spec](https://www.tensorflow.org/lite/performance/quantization_spec), +including those generated using +[post-training integer quantization](https://www.tensorflow.org/lite/performance/post_training_integer_quant). +UInt8 models trained with the legacy +[quantization-aware training](https://github.com/tensorflow/tensorflow/tree/r1.13/tensorflow/contrib/quantize) +path are also supported, for e.g., +[these quantized versions](https://www.tensorflow.org/lite/guide/hosted_models#quantized_models) +on our Hosted Models page. ## Hexagon Delegate Java API @@ -254,10 +254,6 @@ ro.board.platform`). ## FAQ -* Will the delegate support models created using - [post-training quantization](https://www.tensorflow.org/lite/performance/post_training_quantization)? - * This is tentatively planned for a future release, though there is no - concrete timeline. * Which ops are supported by the delegate? * See the current list of [supported ops and constraints](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/delegates/hexagon/README.md) * How can I tell that the model is using the DSP when I enable the delegate? From d4f71ff132a1262f4a6b05f58807e8ba3d46b83d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 13:38:25 -0700 Subject: [PATCH 373/412] Enable tests for tf.linalg.tensordot in eager mode. 
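A note on the error assertions used below: a static shape mismatch surfaces as ValueError during graph construction but as InvalidArgumentError when executed eagerly, so the updated tests accept either. A small sketch of that pattern, with the same shapes the test uses:

  import numpy as np
  from tensorflow.python.framework import errors_impl
  from tensorflow.python.ops import math_ops

  a = np.array([[1., 2.], [3., 4.]])
  b = np.array([[1., 2.], [3., 4.], [5., 6.]])
  try:
    math_ops.tensordot(a, b, ([1], [0]))  # contracted dims 2 vs 3 mismatch
  except (ValueError, errors_impl.InvalidArgumentError) as e:
    print("shape mismatch surfaced as", type(e).__name__)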
PiperOrigin-RevId: 312144965 Change-Id: I2d75f7d9bd7f05aef6d1dee620dffcea66071b97 --- .../python/kernel_tests/tensordot_op_test.py | 43 ++++++++++++------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py index 71e448f7855..7f8c5e9781b 100644 --- a/tensorflow/python/kernel_tests/tensordot_op_test.py +++ b/tensorflow/python/kernel_tests/tensordot_op_test.py @@ -20,7 +20,7 @@ from __future__ import print_function import numpy as np -from tensorflow.python import tf2 +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl @@ -41,16 +41,19 @@ def _add_test(test, test_name, fn): class TensordotTest(test_lib.TestCase): - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_invalid_shape(self): a = [[1, 2], [3, 4]] b = [[1, 2], [3, 4], [5, 6]] a_axes = [1] b_axes = [0] # Invalid static shapes. - with self.assertRaises(ValueError): + with self.assertRaises((ValueError, errors_impl.InvalidArgumentError)): math_ops.tensordot(a, b, (a_axes, b_axes)) + # Invalid dynamic shapes. + if context.executing_eagerly(): + return with self.cached_session() as sess: with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, "Matrix size-incompatible"): @@ -65,7 +68,7 @@ class TensordotTest(test_lib.TestCase): axes_ph: (a_axes, b_axes) }) - @test_util.run_v1_only("b/120545219") + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_invalid_axes(self): a = [[1, 2], [3, 4]] b = [[1, 2], [3, 4]] @@ -77,6 +80,8 @@ class TensordotTest(test_lib.TestCase): with self.assertRaises(IndexError): math_ops.tensordot(a, b, [[0], [7]]) + if context.executing_eagerly(): + return # Invalid dynamic axes. 
a_ph = array_ops.placeholder(dtypes.float32) b_ph = array_ops.placeholder(dtypes.float32) @@ -93,22 +98,22 @@ class TensordotTest(test_lib.TestCase): axes_ph: axes_value }) - # Test case for 11950 + # Test case for https://github.com/tensorflow/tensorflow/issues/11950 + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_valid_axis(self): for axes_value in [1, 2], [[1], [2]], [[], []], 0: - with self.cached_session(): - np_a = np.ones((3, 3)) - np_b = np.array([2, 3, 1])[None, None] - np_ans = np.tensordot(np_a, np_b, axes_value) + np_a = np.ones((3, 3)) + np_b = np.array([2, 3, 1])[None, None] + np_ans = np.tensordot(np_a, np_b, axes_value) - tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) - tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] - tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value) + tf_a = array_ops.ones((3, 3), dtype=dtypes.float32) + tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None] + tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value) - self.assertAllEqual(tf_ans.shape, np_ans.shape) - self.assertAllEqual(tf_ans, np_ans) + self.assertAllEqual(tf_ans.shape, np_ans.shape) + self.assertAllEqual(self.evaluate(tf_ans), np_ans) - @test_util.run_v1_only("b/120545219") + @test_util.run_v1_only("Shape inference test") def test_partial_shape_inference(self): for axes in ([1], [0]), 1: a = array_ops.placeholder(dtypes.float32) @@ -159,7 +164,10 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): size=np.prod(b_shape)).reshape(b_shape).astype(dtype_) return a, b, a_dims, b_dims + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_tensordot(self): + if dynamic_shape_ and context.executing_eagerly(): + self.skipTest("Placeholders not support in eager mode") num_trials = min(30, num_dims_ * num_dims_) if dtype_ == np.float16: tol = 0.05 @@ -187,7 +195,10 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_): self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol) self.assertAllEqual(tf_ans.shape, np_ans.shape) + @test_util.run_in_graph_and_eager_modes(use_gpu=True) def test_tensordot_scalar_axes(self): + if dynamic_shape_ and context.executing_eagerly(): + self.skipTest("Placeholders not support in eager mode") if num_dims_ < 1: self.skipTest("Not a test") if dtype_ == np.float16: @@ -229,7 +240,7 @@ if __name__ == "__main__": for rank_b in 1, 2, 4, 5: for num_dims in range(0, min(rank_a, rank_b) + 1): # TF2 does not support placeholders under eager so we skip it - for dynamic_shape in set([False, not tf2.enabled()]): + for dynamic_shape in set([False, True]): for testcase in _get_tensordot_tests(dtype, rank_a, rank_b, num_dims, dynamic_shape): name = "%s_%s_%s_%s_%s_%s" % (testcase.__name__, dtype.__name__, From ecf503380978e04e5e47f231fcc33a49d6c9d841 Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Mon, 18 May 2020 13:38:32 -0700 Subject: [PATCH 374/412] Return a meaningful error for dynamic shape inputs with outside compilation head extraction in TPUs. 
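Context for the new test below: batching without drop_remainder leaves the batch dimension unknown, and automatic outside compilation (turned on via soft device placement) currently rejects such dynamically shaped inputs with a compilation error. A short sketch showing where the dynamic dimension comes from, using the same dataset construction as the test:

  from tensorflow.python.data.ops import dataset_ops

  ds = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch(
      2, drop_remainder=False)
  print(ds.element_spec)  # leading (batch) dimension is None, i.e. dynamic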
PiperOrigin-RevId: 312144982 Change-Id: I187b58ac8759b391fdcb9649bffd979025350f55 --- .../python/distribute/tpu_strategy_test.py | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tensorflow/python/distribute/tpu_strategy_test.py b/tensorflow/python/distribute/tpu_strategy_test.py index de4c975d5ef..6c93e29c028 100644 --- a/tensorflow/python/distribute/tpu_strategy_test.py +++ b/tensorflow/python/distribute/tpu_strategy_test.py @@ -28,6 +28,7 @@ from tensorflow.python.eager import def_function from tensorflow.python.eager import function from tensorflow.python.eager import remote from tensorflow.python.eager import test +from tensorflow.python.framework import config from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -140,6 +141,9 @@ class TPUStrategyTest(test.TestCase): # for non-local TPU. if FLAGS.tpu: self.skipTest("Recovery fails for non-local TPU, see b/148150981") + + # Disable automatic outside compilation. + config.set_soft_device_placement(False) strategy = get_tpu_strategy() @def_function.function @@ -164,6 +168,28 @@ class TPUStrategyTest(test.TestCase): good_run() + def test_dynamic_shape_with_outside_compilation_failure(self): + # Enable automatic outside compilation. + config.set_soft_device_placement(True) + strategy = get_tpu_strategy() + dataset = dataset_ops.Dataset.from_tensors(("string", 1.0)).repeat().batch( + 2, drop_remainder=False) + dataset = strategy.experimental_distribute_dataset(dataset) + iterator = iter(dataset) + + @def_function.function + def train_fn(iterator): + + def step_fn(inputs): + _, inputs = inputs + return math_ops.reduce_sum(inputs) + + return strategy.experimental_local_results( + strategy.run(step_fn, args=(next(iterator),))) + + with self.assertRaisesRegex(errors.InternalError, "Compilation failure"): + logging.info(train_fn(iterator)) + def test_computation_on_subset_cores(self): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) From 3d4c5d1b578397070d8cecbfe88d8fa06c183189 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Mon, 18 May 2020 14:06:53 -0700 Subject: [PATCH 375/412] NFC: Update canonicalize tests to use regex. 
PiperOrigin-RevId: 312150354 Change-Id: Ifed616606d5c8c708a3800256c4234b9bbb3ce3c --- .../mlir/lite/tests/canonicalize.mlir | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir index 1f067aae685..5c69130c939 100644 --- a/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/canonicalize.mlir @@ -11,9 +11,9 @@ func @reshape_removeAdjacent(tensor<4x4x4xf32>) -> tensor<64xf32> { return %1 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacent -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE]] } // Checks that tfl.reshape should be removed if its output has more than one @@ -29,11 +29,11 @@ func @reshape_removeAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> tensor<64xf32> return %3 : tensor<64xf32> // CHECK-LABEL: func @reshape_removeAdjacentWithMultipleUse -// CHECK: %cst = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: %2 = addf %0, %1 -// CHECK: return %2 +// CHECK: %[[CST:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: %[[RESULT:.*]] = addf %[[RESHAPE_1]], %[[RESHAPE_2]] +// CHECK: return %[[RESULT]] } // Checks that tfl.reshape should be kept if its output has more than one @@ -47,11 +47,11 @@ func @reshape_keepAdjacentWithMultipleUse(tensor<4x4x4xf32>) -> (tensor<16x4xf32 return %0, %1 : tensor<16x4xf32>, tensor<64xf32> // CHECK-LABEL: func @reshape_keepAdjacentWithMultipleUse -// CHECK: %cst = constant dense<[16, 4]> : tensor<2xi32> -// CHECK: %cst_0 = constant dense<64> : tensor<1xi32> -// CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> -// CHECK: %1 = "tfl.reshape"(%arg0, %cst_0) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> -// CHECK: return %0, %1 +// CHECK: %[[CST:.*]] = constant dense<[16, 4]> : tensor<2xi32> +// CHECK: %[[CST_0:.*]] = constant dense<64> : tensor<1xi32> +// CHECK: %[[RESHAPE_1:.*]] = "tfl.reshape"(%arg0, %[[CST]]) : (tensor<4x4x4xf32>, tensor<2xi32>) -> tensor<16x4xf32> +// CHECK: %[[RESHAPE_2:.*]] = "tfl.reshape"(%arg0, %[[CST_0]]) : (tensor<4x4x4xf32>, tensor<1xi32>) -> tensor<64xf32> +// CHECK: return %[[RESHAPE_1]], %[[RESHAPE_2]] } // Checks that tfl.reshape should be removed if its output type is the same From 6dcb7268bb28221134cd1151a730e89023d59623 Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Mon, 18 May 2020 14:33:45 -0700 Subject: [PATCH 376/412] Rename `_get_closest` to more accurately reflect what it does. 
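The renamed helper returns the component on the current replica or device when one matches and otherwise falls back to the primary component. An illustrative-only sketch of that behavior (hypothetical names, not the actual tf.distribute internals):

  # Hypothetical stand-in for DistributedValues._get_on_device_or_primary.
  def get_on_device_or_primary(components, primary, current_replica_id):
    if current_replica_id is None:
      # Not inside a replica context: fall back to the primary component.
      return primary
    return components[current_replica_id]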
PiperOrigin-RevId: 312155516 Change-Id: I27d8dd110ace0150ea735f718ed94948a9a75a74 --- tensorflow/python/distribute/values.py | 22 +++++++++++----------- tensorflow/python/training/optimizer.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tensorflow/python/distribute/values.py b/tensorflow/python/distribute/values.py index 444915aa123..84904f93104 100644 --- a/tensorflow/python/distribute/values.py +++ b/tensorflow/python/distribute/values.py @@ -139,7 +139,7 @@ class DistributedValues(object): "This method should be overridden by sub-classes which support cross-" "replica accesses.") - def _get_closest(self): + def _get_on_device_or_primary(self): """Returns value in same replica or device if possible, else the _primary.""" replica_id = _get_current_replica_id_as_int() if replica_id is None: @@ -379,7 +379,7 @@ class Mirrored(DistributedDelegate): """Holds a map from replica to values which are kept in sync.""" def _get_cross_replica(self): - return self._get_closest() + return self._get_on_device_or_primary() def _as_graph_element(self): obj = self._get() @@ -480,11 +480,11 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return init_op def initialized_value(self): - return self._get_closest().initialized_value() + return self._get_on_device_or_primary().initialized_value() @property def initial_value(self): - return self._get_closest().initial_value + return self._get_on_device_or_primary().initial_value @property def constraint(self): @@ -537,7 +537,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return self._values[replica_id].handle def eval(self, session=None): - return self._get_closest().eval(session) + return self._get_on_device_or_primary().eval(session) @property def _save_slice_info(self): @@ -552,7 +552,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, @property def device(self): - return self._get_closest().device + return self._get_on_device_or_primary().device @property def trainable(self): @@ -587,7 +587,7 @@ class DistributedVariable(DistributedDelegate, variables_lib.Variable, return array_ops.identity(self._get()) def value(self): - return self._get_closest().value() + return self._get_on_device_or_primary().value() def numpy(self): if context.executing_eagerly(): @@ -961,7 +961,7 @@ class MirroredVariable(DistributedVariable, Mirrored): return array_ops.identity(Mirrored._get_cross_replica(self)) def _as_graph_element(self): - return self._get_closest()._as_graph_element() # pylint: disable=protected-access + return self._get_on_device_or_primary()._as_graph_element() # pylint: disable=protected-access def _gather_saveables_for_checkpoint(self): """Overrides Trackable method. @@ -1067,7 +1067,7 @@ class SyncOnReadVariable(DistributedVariable): """Holds a map from replica to variables whose values are reduced on save.""" def _update_replica(self, update_fn, value, **kwargs): - return update_fn(self._get_closest(), value, **kwargs) + return update_fn(self._get_on_device_or_primary(), value, **kwargs) # TODO(b/154017756): Make assign behaivor in cross replica context consistent # with MirroredVariable. @@ -1146,8 +1146,8 @@ class SyncOnReadVariable(DistributedVariable): if ds_context.in_cross_replica_context(): return self._get_cross_replica() else: - # _get_closest() returns a Variable. - return self._get_closest().value() + # _get_on_device_or_primary() returns a Variable. 
+ return self._get_on_device_or_primary().value() def _get_cross_replica(self): if self._aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA: diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py index 9732ea04f26..1fe8a8c729b 100644 --- a/tensorflow/python/training/optimizer.py +++ b/tensorflow/python/training/optimizer.py @@ -768,7 +768,7 @@ class Optimizer( # pylint: enable=protected-access mirrored_slot = named_slots.get(key, None) if mirrored_slot is None: return None - return mirrored_slot._get_closest() # pylint: disable=protected-access + return mirrored_slot._get_on_device_or_primary() # pylint: disable=protected-access return named_slots.get(_var_key(var), None) From 756e66db61ec5b0a642be7381f65cc87d4e64802 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 15:03:26 -0700 Subject: [PATCH 377/412] Modify signature of layout_config(). PiperOrigin-RevId: 312161403 Change-Id: I9304d4839f6bcea6804dd959b131ffac7c0be6d6 --- tensorflow/compiler/xla/service/hlo_module_config.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index 833d0fe59d0..964f83322a4 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -204,7 +204,7 @@ class HloModuleConfig { std::vector>* mutable_dot_config() { return &dot_config_; } - absl::Span>> layout_config() const { + const std::vector>>& layout_config() const { return layout_config_; } From 1a07ecf8526bca5748bf447b16586b60889cdc36 Mon Sep 17 00:00:00 2001 From: Xiao Yu Date: Mon, 18 May 2020 15:08:28 -0700 Subject: [PATCH 378/412] In TF-TFRT integration, C API will get dtype from underlying fallback tensor directly if the tfrt dtype is Unsupported. This is used to support dtypes that are not natively implemented in TFRT (e.g. DT_RESOURCE). Enable a few resnet50 tests. 
PiperOrigin-RevId: 312162457 Change-Id: Iece6d621120e8b20d0a0fe7b271a76dc29caa924 --- .../python/eager/benchmarks/resnet50/resnet50_test.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py index 9d049a6d59d..34ceb56d129 100644 --- a/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py +++ b/tensorflow/python/eager/benchmarks/resnet50/resnet50_test.py @@ -104,7 +104,6 @@ class ResNet50Test(tf.test.TestCase): context.async_wait() self.assertEqual((2, 1000), output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply(self): self._apply(defun=False) @@ -121,7 +120,6 @@ class ResNet50Test(tf.test.TestCase): def test_apply_with_defun_async(self): self._apply(defun=True, execution_mode=context.ASYNC) - @test_util.disable_tfrt('b/155260334') def test_apply_no_top(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False) @@ -132,7 +130,6 @@ class ResNet50Test(tf.test.TestCase): if data_format == 'channels_first' else (2, 1, 1, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_with_pooling(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False, pooling='avg') @@ -141,7 +138,6 @@ class ResNet50Test(tf.test.TestCase): output = model(images, training=False) self.assertEqual((2, 2048), output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_no_average_pooling(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -153,7 +149,6 @@ class ResNet50Test(tf.test.TestCase): (2, 7, 7, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_block3_strides(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -165,7 +160,6 @@ class ResNet50Test(tf.test.TestCase): (2, 1, 1, 2048)) self.assertEqual(output_shape, output.shape) - @test_util.disable_tfrt('b/155260334') def test_apply_retrieve_intermediates(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50( @@ -220,7 +214,6 @@ class ResNet50Test(tf.test.TestCase): self.assertEqual(len(events), 2) self.assertEqual(events[1].summary.value[0].tag, 'loss') - @test_util.disable_tfrt('b/155260334') def test_train(self): self._test_train() @@ -228,7 +221,6 @@ class ResNet50Test(tf.test.TestCase): def test_train_async(self): self._test_train(execution_mode=context.ASYNC) - @test_util.disable_tfrt('b/155260334') def test_no_garbage(self): device, data_format = resnet50_test_util.device_and_data_format() model = resnet50.ResNet50(data_format) From 3c54ef5ab94813713ae538b76a78e1fac4ac424d Mon Sep 17 00:00:00 2001 From: Yujing Zhang Date: Mon, 18 May 2020 15:17:54 -0700 Subject: [PATCH 379/412] Support running a tf.function with packed variable inputs both locally and remotely. - Support packing multiple EagerTensors of the same dtype and shape. - Create CompositeDevices on the same task as the local host CPU, in order to correctly trigger packed TensorHandle copy from a client to a remote worker. 
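A rough usage sketch of the packing entry point added here, assuming the Context.pack_eager_tensors method from the diff and two logical CPUs so the packed components sit on distinct devices (the change summary and diffs follow below):

  from tensorflow.python.eager import context
  from tensorflow.python.framework import config
  from tensorflow.python.framework import constant_op
  from tensorflow.python.framework import ops

  # Expose two logical CPUs so the packed components live on distinct devices.
  cpus = config.list_physical_devices("CPU")
  config.set_logical_device_configuration(
      cpus[0], [context.LogicalDeviceConfiguration(),
                context.LogicalDeviceConfiguration()])

  with ops.device("CPU:0"):
    t0 = constant_op.constant([1.0, 2.0])
  with ops.device("CPU:1"):
    t1 = constant_op.constant([3.0, 4.0])
  # Inputs must share dtype and shape; the result lives on a COMPOSITE device.
  packed = context.context().pack_eager_tensors([t0, t1])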
PiperOrigin-RevId: 312164194 Change-Id: Ia15718309c8c68eb645bfe0bf967ddd6d2551b3a --- .../core/common_runtime/composite_device.cc | 12 ++-- .../core/common_runtime/composite_device.h | 5 +- .../common_runtime/composite_device_test.cc | 11 ++-- .../core/common_runtime/eager/context.cc | 7 ++- .../core/common_runtime/eager/context_test.cc | 12 ++-- .../common_runtime/eager/execute_node_test.cc | 3 +- .../eager/tensor_handle_test.cc | 3 + .../process_function_library_runtime_test.cc | 3 +- tensorflow/python/eager/backprop.py | 13 ++++ tensorflow/python/eager/context.py | 16 +++++ tensorflow/python/eager/function_test.py | 37 ++++++++++++ tensorflow/python/eager/pywrap_tensor.cc | 15 ++++- tensorflow/python/eager/pywrap_tfe.h | 3 +- tensorflow/python/eager/remote_test.py | 31 ++++++++++ tensorflow/python/framework/ops.py | 59 +++++++++++++++++++ tensorflow/python/framework/ops_test.py | 47 +++++++++++++++ tensorflow/python/tfe_wrapper.cc | 20 +++++++ 17 files changed, 274 insertions(+), 23 deletions(-) diff --git a/tensorflow/core/common_runtime/composite_device.cc b/tensorflow/core/common_runtime/composite_device.cc index 3103fa37941..7fd41e00a04 100644 --- a/tensorflow/core/common_runtime/composite_device.cc +++ b/tensorflow/core/common_runtime/composite_device.cc @@ -24,7 +24,7 @@ const char* const kCompositeDeviceType = "COMPOSITE"; std::unique_ptr CompositeDevice::MakeDevice( const std::vector& underlying_devices, const int unique_device_id, - Status* status) { + const DeviceNameUtils::ParsedName& host_name, Status* status) { if (underlying_devices.empty()) { status->Update( errors::InvalidArgument("underlying_devices should not be empty.")); @@ -62,13 +62,15 @@ std::unique_ptr CompositeDevice::MakeDevice( return nullptr; } } + + DeviceNameUtils::ParsedName parsed_composite_name = host_name; DeviceAttributes device_attributes; - parsed_name.type = kCompositeDeviceType; - device_attributes.set_device_type(parsed_name.type); - parsed_name.id = unique_device_id; + parsed_composite_name.type = kCompositeDeviceType; + parsed_composite_name.id = unique_device_id; const string composite_name = - DeviceNameUtils::ParsedNameToString(parsed_name); + DeviceNameUtils::ParsedNameToString(parsed_composite_name); device_attributes.set_name(composite_name); + device_attributes.set_device_type(kCompositeDeviceType); return absl::WrapUnique( new CompositeDevice(device_attributes, underlying_devices)); diff --git a/tensorflow/core/common_runtime/composite_device.h b/tensorflow/core/common_runtime/composite_device.h index 127e5b8303a..850eae55e8d 100644 --- a/tensorflow/core/common_runtime/composite_device.h +++ b/tensorflow/core/common_runtime/composite_device.h @@ -42,10 +42,11 @@ class CompositeDevice : public Device { return &underlying_devices_; } - // Helper for creating a CompositeDevice. + // Helper for creating a CompositeDevice on the same task as the given host + // CPU. static std::unique_ptr MakeDevice( const std::vector& underlying_devices, const int unique_device_id, - Status* status); + const DeviceNameUtils::ParsedName& host_name, Status* status); private: CompositeDevice(const DeviceAttributes& device_attributes, diff --git a/tensorflow/core/common_runtime/composite_device_test.cc b/tensorflow/core/common_runtime/composite_device_test.cc index ac2f9108ecb..73a6ae44912 100644 --- a/tensorflow/core/common_runtime/composite_device_test.cc +++ b/tensorflow/core/common_runtime/composite_device_test.cc @@ -20,12 +20,15 @@ limitations under the License. 
namespace tensorflow { TEST(CompositeDeviceTest, Basic) { + const string host_name = "/job:localhost/replica:0/task:0/device:CPU:0"; + DeviceNameUtils::ParsedName parsed_host_name; + EXPECT_TRUE(DeviceNameUtils::ParseFullName(host_name, &parsed_host_name)); std::vector underlying_devices; { Status status; std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/0, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE(absl::StrContains(status.error_message(), @@ -41,7 +44,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:CPU:1"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/0, - &status); + parsed_host_name, &status); TF_ASSERT_OK(status); EXPECT_EQ(composite_device->device_type(), kCompositeDeviceType); EXPECT_EQ(underlying_devices, *composite_device->underlying_devices()); @@ -53,7 +56,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:CPU:0"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/1, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE( @@ -68,7 +71,7 @@ TEST(CompositeDeviceTest, Basic) { "/job:localhost/replica:0/task:0/device:GPU:0"); std::unique_ptr composite_device = CompositeDevice::MakeDevice(underlying_devices, /*unique_device_id=*/1, - &status); + parsed_host_name, &status); EXPECT_EQ(composite_device, nullptr); EXPECT_EQ(error::INVALID_ARGUMENT, status.code()); EXPECT_TRUE(absl::StrContains(status.error_message(), diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index b8dfe92aac6..207c6a02d5b 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -935,8 +935,11 @@ Status EagerContext::FindOrCreateCompositeDevice( } Status s; - auto device = CompositeDevice::MakeDevice(underlying_devices, - composite_devices_.size(), &s); + // Create a CompositeDevice on the same task as the host CPU, in order to + // trigger packed TensorHandle copy from a client to a remote worker. 
+ auto device = + CompositeDevice::MakeDevice(underlying_devices, composite_devices_.size(), + HostCPU()->parsed_name(), &s); TF_RETURN_IF_ERROR(s); *composite_device = device.get(); pflr_->AddCompositeDevice(*composite_device); diff --git a/tensorflow/core/common_runtime/eager/context_test.cc b/tensorflow/core/common_runtime/eager/context_test.cc index f83e3f0b45d..c6ed61c80c4 100644 --- a/tensorflow/core/common_runtime/eager/context_test.cc +++ b/tensorflow/core/common_runtime/eager/context_test.cc @@ -31,7 +31,7 @@ static Device* CreateDevice(const string& type, int n) { Allocator* GetAllocator(AllocatorAttributes) override { return nullptr; } }; DeviceAttributes attr; - attr.set_name("/job:a/replica:0/task:0/device:" + type + ":" + + attr.set_name("/job:localhost/replica:0/task:0/device:" + type + ":" + std::to_string(n)); attr.set_device_type(type); return new FakeDevice(attr); @@ -179,10 +179,10 @@ TEST_F(EagerContextTest, CompositeDevice) { TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, &composite_device_0)); EXPECT_EQ(composite_device_0->name(), - "/job:worker/replica:0/task:0/device:COMPOSITE:0"); + "/job:localhost/replica:0/task:0/device:COMPOSITE:0"); CompositeDevice* device = nullptr; TF_EXPECT_OK(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:0", &device)); + "/job:localhost/replica:0/task:0/device:COMPOSITE:0", &device)); EXPECT_EQ(device, composite_device_0); CompositeDevice* composite_device_1 = nullptr; TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, @@ -193,13 +193,13 @@ TEST_F(EagerContextTest, CompositeDevice) { TF_ASSERT_OK(context()->FindOrCreateCompositeDevice(underlying_devices, &composite_device_2)); EXPECT_EQ(composite_device_2->name(), - "/job:worker/replica:0/task:0/device:COMPOSITE:1"); + "/job:localhost/replica:0/task:0/device:COMPOSITE:1"); TF_EXPECT_OK(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:1", &device)); + "/job:localhost/replica:0/task:0/device:COMPOSITE:1", &device)); EXPECT_EQ(device, composite_device_2); EXPECT_TRUE(errors::IsNotFound(context()->FindCompositeDeviceFromName( - "/job:worker/replica:0/task:0/device:COMPOSITE:2", &device))); + "/job:localhost/replica:0/task:0/device:COMPOSITE:2", &device))); } } // namespace diff --git a/tensorflow/core/common_runtime/eager/execute_node_test.cc b/tensorflow/core/common_runtime/eager/execute_node_test.cc index 99f030322df..83fbcf5017e 100644 --- a/tensorflow/core/common_runtime/eager/execute_node_test.cc +++ b/tensorflow/core/common_runtime/eager/execute_node_test.cc @@ -61,7 +61,8 @@ TEST(ExecuteNodeTest, ExecuteNodeArgs) { Status s; std::unique_ptr composite_device = CompositeDevice::MakeDevice({device0->name(), device1->name()}, - /*unique_device_id=*/0, &s); + /*unique_device_id=*/0, + device_mgr.HostCPU()->parsed_name(), &s); TF_ASSERT_OK(s); auto ctx = new EagerContext( diff --git a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc index 779158375de..13b634bbec4 100644 --- a/tensorflow/core/common_runtime/eager/tensor_handle_test.cc +++ b/tensorflow/core/common_runtime/eager/tensor_handle_test.cc @@ -100,6 +100,7 @@ class PackedTensorHandleTest : public ::testing::Test { for (const char* name : device_names_) { devices.emplace_back(CreateDevice("GPU", name)); } + devices.emplace_back(CreateDevice("CPU", host_name_)); device_mgr_ = new StaticDeviceMgr(std::move(devices)); context_ = new 
EagerContext( @@ -132,6 +133,8 @@ class PackedTensorHandleTest : public ::testing::Test { "/job:worker/replica:0/task:1/device:GPU:0", "/job:worker/replica:0/task:1/device:GPU:1"}; + const char* host_name_ = "/job:worker/replica:0/task:0/device:CPU:0"; + StaticDeviceMgr* device_mgr_; EagerContext* context_; }; diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc index 247b94dc58c..5bdb4601d37 100644 --- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc +++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc @@ -820,7 +820,8 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, MultiDevice_CompositeDevice) { Status s; std::unique_ptr composite_device = CompositeDevice::MakeDevice({device0_->name(), device1_->name()}, - /*unique_device_id=*/0, &s); + /*unique_device_id=*/0, + device_mgr_->HostCPU()->parsed_name(), &s); TF_ASSERT_OK(s); AddCompositeDevice(composite_device.get()); diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py index fb7c4055136..7a3dce7db4e 100644 --- a/tensorflow/python/eager/backprop.py +++ b/tensorflow/python/eager/backprop.py @@ -241,6 +241,11 @@ def implicit_val_and_grad(f): "function was being computed.") sources = [v.handle for v in variables] + for s in sources: + if getattr(s, "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors yet." + ) grad = imperative_grad.imperative_grad(this_tape, nest.flatten(end_node), sources) return end_node, list(zip(grad, variables)) @@ -548,6 +553,10 @@ def make_vjp(f, params=None, persistent=True): ] args = _ensure_unique_tensor_objects(parameter_positions, args) for i in parameter_positions: + if getattr(args[i], "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors" + "yet.") sources.append(args[i]) tape.watch(this_tape, args[i]) result = f(*args) @@ -1032,6 +1041,10 @@ class GradientTape(object): logging.WARN, "The dtype of the source tensor must be " "floating (e.g. tf.float32) when calling GradientTape.gradient, " "got %r", t.dtype) + if getattr(t, "is_packed", False): + raise ValueError( + "GradientTape.gradient is not supported on packed EagerTensors yet." + ) if output_gradients is not None: output_gradients = [None if x is None else ops.convert_to_tensor(x) diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index 86b3d5cf95f..604a960afd5 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -1123,6 +1123,22 @@ class Context(object): pywrap_tfe.TFE_Py_RegisterCustomDevice(self._handle, device_capsule, device_name, device_info_capsule) + def pack_eager_tensors(self, tensors): + """Pack multiple `EagerTensor`s of the same dtype and shape. + + Args: + tensors: a list of EagerTensors to pack. + + Returns: + A packed EagerTensor. + """ + self.ensure_initialized() + if self._lazy_remote_inputs_copy is not None and ( + not self._lazy_remote_inputs_copy): + raise ValueError("Packing eager tensors is not supported when " + "lazy_remote_inputs_copy is disabled.") + return pywrap_tfe.TFE_Py_PackEagerTensors(self._handle, tensors) + def remove_function(self, name): """Remove a function from the context. 
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index 4e68f1460d9..078ca8b8878 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -186,6 +186,43 @@ class FunctionTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp(AttributeError, 'no attribute'): add(c) + def testPackedVariable(self): + with ops.device('/cpu:0'): + v0_0 = resource_variable_ops.ResourceVariable(1.0) + with ops.device('/cpu:1'): + v0_1 = resource_variable_ops.ResourceVariable(2.0) + v1_0 = resource_variable_ops.ResourceVariable(3.0) + with ops.device('/cpu:2'): + v1_1 = resource_variable_ops.ResourceVariable(4.0) + + packed_var_0 = ops.pack_eager_tensors([v0_0.handle, v0_1.handle]) + packed_var_1 = ops.pack_eager_tensors([v1_0.handle, v1_1.handle]) + + # TODO(b/145922293): use ResourceVariable.assign_add and + # ResourceVariable.read_value directly once we support packing multiple + # ResourceVariable into one ResourceVariable. + @def_function.function + def read_var(): + resource_variable_ops.assign_add_variable_op( + packed_var_0, constant_op.constant(5.0)) + resource_variable_ops.assign_add_variable_op( + packed_var_1, constant_op.constant(6.0)) + with ops.device('/cpu:0'): + read0 = resource_variable_ops.read_variable_op( + packed_var_0, dtype=dtypes.float32) + with ops.device('/cpu:1'): + read1 = resource_variable_ops.read_variable_op( + packed_var_0, dtype=dtypes.float32) + read2 = resource_variable_ops.read_variable_op( + packed_var_1, dtype=dtypes.float32) + with ops.device('/cpu:2'): + read3 = resource_variable_ops.read_variable_op( + packed_var_1, dtype=dtypes.float32) + + return read0, read1, read2, read3 + + self.assertAllEqual(read_var(), (1 + 5, 2 + 5, 3 + 6, 4 + 6)) + def testImplementsAttributeBasic(self): v = def_function.function( experimental_implements='func')(lambda x, y: x + y) diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc index a72f74b38b8..b209ddb6162 100644 --- a/tensorflow/python/eager/pywrap_tensor.cc +++ b/tensorflow/python/eager/pywrap_tensor.cc @@ -345,6 +345,8 @@ typedef struct EagerTensor { char unused[kMaxEagerTensorParentSize]; TFE_TensorHandle* handle; int64_t id; + // Indicates whether it's a packed tensor or not. + bool is_packed; // This mirrors tensorflow.core.framework.ops.Tensor._handle_data Which will // be None for tensors of type other than DT_RESOURCE. For DT_RESOURCE // tensors, this will contain a serialized HandleData proto with shape @@ -418,6 +420,7 @@ bool MaybeInvokeCreatedOnEagerTensorProfiler(EagerTensor* created_tensor) { int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) { self->id = get_uid(); self->handle = nullptr; + self->is_packed = false; Py_INCREF(Py_None); self->handle_data = Py_None; Py_INCREF(Py_None); @@ -647,6 +650,11 @@ static PyObject* EagerTensor_backing_device(EagerTensor* self) { #endif } +// Getter `is_packed`. 
+static PyObject* EagerTensor_is_packed(EagerTensor* self) { + return PyBool_FromLong(self->is_packed); +} + static PyGetSetDef EagerTensor_getsetters[] = { {const_cast("_id"), (getter)EagerTensor_getid, nullptr, const_cast("Tensor ID."), nullptr}, @@ -655,6 +663,9 @@ static PyGetSetDef EagerTensor_getsetters[] = { {const_cast("backing_device"), (getter)EagerTensor_backing_device, nullptr, const_cast("Device on which tensor's memory is resident."), nullptr}, + {const_cast("is_packed"), (getter)EagerTensor_is_packed, nullptr, + const_cast("Whether the EagerTensor is a packed tensor or not."), + nullptr}, {const_cast("_handle_data"), (getter)EagerTensor_handle_data, (setter)EagerTensor_sethandle_data, const_cast("Shape/DType data if the EagerTensor is a DT_RESOURCE"), @@ -813,7 +824,8 @@ TFE_TensorHandle* EagerTensor_Handle(const PyObject* o) { return reinterpret_cast(o)->handle; } -PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) { +PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle, + const bool is_packed) { if (handle == nullptr) { return nullptr; } @@ -821,6 +833,7 @@ PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle) { EagerTensorType->tp_new(EagerTensorType, EmptyTuple(), EmptyDict())); if (t != nullptr) { t->id = get_uid(); + t->is_packed = is_packed; Py_INCREF(Py_None); t->handle_data = Py_None; Py_INCREF(Py_None); diff --git a/tensorflow/python/eager/pywrap_tfe.h b/tensorflow/python/eager/pywrap_tfe.h index 92a0a200e3d..a5c9c181539 100755 --- a/tensorflow/python/eager/pywrap_tfe.h +++ b/tensorflow/python/eager/pywrap_tfe.h @@ -129,7 +129,8 @@ void TFE_DeleteContextCapsule(PyObject* context); bool EagerTensor_CheckExact(const PyObject* o); // Helper function to construct a new EagerTensor from a TFE_TensorHandle. -PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle); +PyObject* EagerTensorFromHandle(TFE_TensorHandle* handle, + const bool is_packed = false); // Extracts the handle inside EagerTensor object `o`. Returns nullptr on error. 
TFE_TensorHandle* EagerTensor_Handle(const PyObject* o); diff --git a/tensorflow/python/eager/remote_test.py b/tensorflow/python/eager/remote_test.py index 32fe6372f77..710e7bf5f9d 100644 --- a/tensorflow/python/eager/remote_test.py +++ b/tensorflow/python/eager/remote_test.py @@ -40,6 +40,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variables from tensorflow.python.training import server_lib from tensorflow.python.training.server_lib import ClusterSpec @@ -324,6 +325,36 @@ class MultiWorkersTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(remote_function(constant_op.constant([1.0])), [3.0]) + def testMultiDeviceFunctionWithPackedVariable(self): + with ops.device('/job:worker/replica:0/task:0/device:CPU:0'): + var0 = resource_variable_ops.ResourceVariable(1.0) + with ops.device('/job:worker/replica:0/task:1/device:CPU:0'): + var1 = resource_variable_ops.ResourceVariable(2.0) + + packed_var = ops.pack_eager_tensors([var0.handle, var1.handle]) + self.assertEqual(packed_var.device, + '/job:localhost/replica:0/task:0/device:COMPOSITE:0') + self.assertEqual(packed_var.backing_device, + '/job:localhost/replica:0/task:0/device:COMPOSITE:0') + + @def_function.function + def add_variables(): + with ops.device('/job:worker/replica:0/task:0/device:CPU:0'): + read0 = resource_variable_ops.read_variable_op( + packed_var, dtype=dtypes.float32) + with ops.device('/job:worker/replica:0/task:1/device:CPU:0'): + read1 = resource_variable_ops.read_variable_op( + packed_var, dtype=dtypes.float32) + + return read0 + read1 + + # Run the function on a remote device + with ops.device('/job:worker/replica:0/task:0'): + self.assertAllEqual(add_variables().numpy(), 3.0) + + # Run the function on a local worker + self.assertAllEqual(add_variables().numpy(), 3.0) + @test_util.eager_lazy_remote_copy_on_and_off def testMultiDeviceFunctionOnRemoteDeviceWithWait(self): with ops.device('/job:worker/replica:0/task:1'): diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 43652d51eae..5b6dac5be34 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1394,6 +1394,65 @@ def _error_prefix(name): return "" if name is None else "%s: " % name +def pack_eager_tensors(tensors, ctx=None): + """Pack multiple `EagerTensor`s of the same dtype and shape. + + Args: + tensors: a list of EagerTensors to pack. + ctx: context.context(). + + Returns: + A packed EagerTensor. 
+ """ + if not isinstance(tensors, list): + raise TypeError("tensors must be a list or a tuple: %s" % tensors) + + if not tensors: + raise ValueError("Empty tensors is unexpected for packing.") + + dtype = tensors[0].dtype + shape = tensors[0].shape + handle_data = tensors[0]._handle_data # pylint: disable=protected-access + is_resource = dtype == dtypes.resource + for i in range(len(tensors)): + t = tensors[i] + if not isinstance(t, EagerTensor): + raise TypeError("tensors must be a list of EagerTensors: %s" % t) + + if t.dtype != dtype: + raise ValueError( + "All tensors being packed should have the same dtype %s, " + "but the %d-th tensor is of dtype %s" % (dtype, i, t.dtype)) + if t.shape != shape: + raise ValueError( + "All tensors being packed should have the same shape %s, " + "but the %d-th tensor is of shape %s" % (shape, i, t.shape)) + # pylint: disable=protected-access + if is_resource and t._handle_data != handle_data: + raise ValueError( + "All tensors being packed should have the same handle data %s, " + "but the %d-th tensor is of handle data %s" % + (handle_data, i, t._handle_data)) + # pylint: enable=protected-access + + if ctx is None: + ctx = context.context() + + # Propogate handle data for resource variables + packed_tensor = ctx.pack_eager_tensors(tensors) + if handle_data is not None: + packed_tensor._handle_data = handle_data # pylint: disable=protected-access + + def grad_fun(_): + raise ValueError( + "Gradients through pack_eager_tensors are not supported yet.") + + tape.record_operation("pack_eager_tensors", [packed_tensor], tensors, + grad_fun) + + return packed_tensor + + def convert_to_tensor(value, dtype=None, name=None, diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py index 322df8ffac8..11193155999 100644 --- a/tensorflow/python/framework/ops_test.py +++ b/tensorflow/python/framework/ops_test.py @@ -34,6 +34,7 @@ from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import function as eager_function from tensorflow.python.eager import wrap_function +from tensorflow.python.framework import config from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op from tensorflow.python.framework import device as pydev @@ -3408,5 +3409,51 @@ class CustomConvertToCompositeTensorTest(test_util.TensorFlowTestCase): self.assertAllEqual(x_, tensor_util.constant_value(y_)) +@test_util.disable_tfrt("Packing EagerTensors is not supported yet.") +class PackEagerTensorTest(test_util.TensorFlowTestCase): + + def setUp(self): + super(PackEagerTensorTest, self).setUp() + context._reset_context() + cpus = config.list_physical_devices("CPU") + # Set 2 virtual CPUs + config.set_logical_device_configuration(cpus[0], [ + context.LogicalDeviceConfiguration(), + context.LogicalDeviceConfiguration(), + ]) + + def testPack(self): + with context.eager_mode(): + with ops.device("CPU:0"): + var0 = resource_variable_ops.ResourceVariable(1.0) + c0 = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + with ops.device("CPU:1"): + var1 = resource_variable_ops.ResourceVariable(2.0) + var2 = resource_variable_ops.ResourceVariable([3.0]) + c1 = constant_op.constant([9.0]) + + packed_var0 = ops.pack_eager_tensors([var0.handle, var1.handle]) + self.assertTrue(packed_var0.is_packed) + self.assertEqual(packed_var0.dtype, var0.handle.dtype) + self.assertEqual(packed_var0.shape, var0.handle.shape) + self.assertEqual(packed_var0._handle_data, 
var0.handle._handle_data) + self.assertIn("COMPOSITE:0", packed_var0.device) + self.assertIn("COMPOSITE:0", packed_var0.backing_device) + with self.assertRaises(errors.InvalidArgumentError): + packed_var0.numpy() + + # Different dtypes + with self.assertRaises(ValueError): + ops.pack_eager_tensors([var0.handle, c1]) + + # Different shapes + with self.assertRaises(ValueError): + ops.pack_eager_tensors([c0, c1]) + + # Different handle data + with self.assertRaises(ValueError): + ops.pack_eager_tensors([var0.handle, var2.handle]) + + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/python/tfe_wrapper.cc b/tensorflow/python/tfe_wrapper.cc index 836cafbd494..efcd912f430 100644 --- a/tensorflow/python/tfe_wrapper.cc +++ b/tensorflow/python/tfe_wrapper.cc @@ -210,6 +210,22 @@ TFE_OutputTensorHandles InputTFE_OutputTensorHandles( return output_tensor_handles; } +// Packs multiple `EagerTensor`s of the same dtype and shape into one +// `EagerTensor`. +py::object TFE_Py_PackEagerTensors_wrapper(const py::handle& context, + const py::handle& tensors) { + TFE_Context* ctx = tensorflow::InputTFE_Context(context); + TFE_InputTensorHandles handles = InputTFE_InputTensorHandles(tensors); + tensorflow::Safe_TF_StatusPtr status = tensorflow::make_safe(TF_NewStatus()); + int size = handles.size(); + TFE_TensorHandle* packed_handle = + TFE_CreatePackedTensorHandle(ctx, handles.data(), &size, status.get()); + tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); + PyObject* packed_tensor = + EagerTensorFromHandle(packed_handle, /*is_packed=*/true); + return tensorflow::PyoOrThrow(packed_tensor); +} + // This function was created from fusing the typemap logic in platform/base.i. py::object TFE_Py_ExecuteCancelable_wrapper( const py::handle& context, const char* device_name, const char* op_name, @@ -558,6 +574,10 @@ PYBIND11_MODULE(_pywrap_tfe, m) { m.def("TFE_Py_InitEagerTensor", [](const py::handle& o) { return tensorflow::PyoOrThrow(TFE_Py_InitEagerTensor(o.ptr())); }); + m.def("TFE_Py_PackEagerTensors", + [](const py::handle& context, const py::handle& handles) { + return tensorflow::TFE_Py_PackEagerTensors_wrapper(context, handles); + }); m.def("TFE_Py_SetEagerTensorProfiler", &TFE_Py_SetEagerTensorProfiler); m.def("TFE_Py_RegisterJVPFunction", [](const py::handle& o) { return tensorflow::PyoOrThrow(TFE_Py_RegisterJVPFunction(o.ptr())); From 4001e3dad3c6340b0c2001d89b3954f189e9aeb5 Mon Sep 17 00:00:00 2001 From: Sachin Joglekar Date: Mon, 18 May 2020 15:22:44 -0700 Subject: [PATCH 380/412] Updates GPU delegate documentation with experimental quant support PiperOrigin-RevId: 312165090 Change-Id: I8fb624f71101fce6a379ed24f6002f8f4b60245d --- tensorflow/lite/g3doc/performance/gpu.md | 2 +- .../lite/g3doc/performance/gpu_advanced.md | 189 ++++++++---------- .../g3doc/performance/model_optimization.md | 6 +- 3 files changed, 84 insertions(+), 113 deletions(-) diff --git a/tensorflow/lite/g3doc/performance/gpu.md b/tensorflow/lite/g3doc/performance/gpu.md index 8762afb4c83..b5abf46f845 100644 --- a/tensorflow/lite/g3doc/performance/gpu.md +++ b/tensorflow/lite/g3doc/performance/gpu.md @@ -31,7 +31,7 @@ models. For a step-by-step tutorial, watch the [GPU Delegate for Android](https://youtu.be/Xkhgre8r5G0) video. -Note: This requires OpenGL ES 3.1 or higher. +Note: This requires OpenCL or OpenGL ES (3.1 or higher). #### Step 1. 
Clone the TensorFlow source code and open it in Android Studio diff --git a/tensorflow/lite/g3doc/performance/gpu_advanced.md b/tensorflow/lite/g3doc/performance/gpu_advanced.md index 9f47c2e55e8..dce3eb8db6b 100644 --- a/tensorflow/lite/g3doc/performance/gpu_advanced.md +++ b/tensorflow/lite/g3doc/performance/gpu_advanced.md @@ -1,9 +1,9 @@ # TensorFlow Lite on GPU [TensorFlow Lite](https://www.tensorflow.org/mobile/tflite/) supports several -hardware accelerators. This document describes how to use the GPU backend using -the TensorFlow Lite delegate APIs on Android (requires OpenGL ES 3.1 or higher) -and iOS (requires iOS 8 or later). +hardware accelerators. This document describes how to use the GPU backend using +the TensorFlow Lite delegate APIs on Android (requires OpenCL or OpenGL ES 3.1 +and higher) and iOS (requires iOS 8 or later). ## Benefits of GPU Acceleration @@ -35,25 +35,33 @@ power and generating less heat than the same task run on a CPU. TensorFlow Lite on GPU supports the following ops in 16-bit and 32-bit float precision: -* `ADD v1` -* `AVERAGE_POOL_2D v1` -* `CONCATENATION v1` -* `CONV_2D v1` -* `DEPTHWISE_CONV_2D v1-2` -* `FULLY_CONNECTED v1` -* `LOGISTIC v1` -* `MAX_POOL_2D v1` -* `MUL v1` -* `PAD v1` -* `PRELU v1` -* `RELU v1` -* `RELU6 v1` -* `RESHAPE v1` -* `RESIZE_BILINEAR v1` -* `SOFTMAX v1` -* `STRIDED_SLICE v1` -* `SUB v1` -* `TRANSPOSE_CONV v1` +* `ADD` +* `AVERAGE_POOL_2D` +* `CONCATENATION` +* `CONV_2D` +* `DEPTHWISE_CONV_2D v1-2` +* `EXP` +* `FULLY_CONNECTED` +* `LOGISTIC` +* `LSTM v2 (Basic LSTM only)` +* `MAX_POOL_2D` +* `MAXIMUM` +* `MINIMUM` +* `MUL` +* `PAD` +* `PRELU` +* `RELU` +* `RELU6` +* `RESHAPE` +* `RESIZE_BILINEAR v1-3` +* `SOFTMAX` +* `STRIDED_SLICE` +* `SUB` +* `TRANSPOSE_CONV` + +By default, all ops are only supported at version 1. Enabling the +[experimental quantization support](gpu_advanced.md#running-quantized-models-experimental-android-only) +allows the appropriate versions; for example, ADD v2. ## Basic Usage @@ -82,8 +90,8 @@ delegate.close(); ### Android (C/C++) For C/C++ usage of TensorFlow Lite GPU on Android, the GPU delegate can be -created with `TfLiteGpuDelegateCreate()` and destroyed with -`TfLiteGpuDelegateDelete()`. +created with `TfLiteGpuDelegateV2Create()` and destroyed with +`TfLiteGpuDelegateV2Delete()`. ```c++ // Set up interpreter. @@ -94,15 +102,7 @@ std::unique_ptr interpreter; InterpreterBuilder(*model, op_resolver)(&interpreter); // NEW: Prepare GPU delegate. -const TfLiteGpuDelegateOptions options = { - .metadata = NULL, - .compile_options = { - .precision_loss_allowed = 1, // FP16 - .preferred_gl_object_type = TFLITE_GL_OBJECT_TYPE_FASTEST, - .dynamic_batch_enabled = 0, // Not fully functional yet - }, -}; -auto* delegate = TfLiteGpuDelegateCreate(&options); +auto* delegate = TfLiteGpuDelegateV2Create(/*default options=*/nullptr); if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; // Run inference. @@ -111,9 +111,13 @@ if (interpreter->Invoke() != kTfLiteOk) return false; ReadFromOutputTensor(interpreter->typed_output_tensor(0)); // NEW: Clean up. -TfLiteGpuDelegateDelete(delegate); +TfLiteGpuDelegateV2Delete(delegate); ``` +Take a look at `TfLiteGpuDelegateOptionsV2` to create a delegate instance with +custom options. You can initialize the default options with +`TfLiteGpuDelegateOptionsV2Default()` and then modify them as necessary. + TFLite GPU for Android C/C++ uses the [Bazel](https://bazel.io) build system. 
The delegate can be built, for example, using the following command: @@ -165,6 +169,43 @@ called. ## Advanced Usage +### Running quantized models (Experimental, Android only) + +The GPU delegate already supports +[float16 quantized](https://www.tensorflow.org/lite/performance/post_training_float16_quant) +models. There is experimental support on Android to run 8-bit quantized as well. +This includes all flavors of quantization, including: + +* Models trained with + [Quantization-aware training](https://www.tensorflow.org/lite/convert/quantization) +* [Post-training dynamic-range quantization](https://www.tensorflow.org/lite/performance/post_training_quant) +* [Post-training full-integer quantization](https://www.tensorflow.org/lite/performance/post_training_integer_quant) + +To optimize performance, use models that have floating-point input & output +tensors. + +This feature can be enabled using delegate options as follows: + +**C++ API** + +```c++ +// NEW: Prepare custom options with feature enabled. +TfLiteGpuDelegateOptionsV2 options = TfLiteGpuDelegateOptionsV2Default(); +options.experimental_flags |= TFLITE_GPU_EXPERIMENTAL_FLAGS_ENABLE_QUANT; + +auto* delegate = TfLiteGpuDelegateV2Create(options); +if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; +``` + +**Java API** + +```java +// NEW: Prepare GPU delegate with feature turned on. +GpuDelegate delegate = new GpuDelegate(new GpuDelegate.Options().setQuantizedModelsAllowed(true)); + +Interpreter.Options options = (new Interpreter.Options()).addDelegate(delegate); +``` + ### Delegate Options for iOS `NewGpuDelegate()` accepts a `struct` of options. @@ -210,7 +251,7 @@ While it is convenient to use `nullptr`, we recommend that you explicitly set the options, to avoid any unexpected behavior if default values are changed in the future. -### Input/Output Buffers +### Input/Output Buffers (iOS only) To do computation on the GPU, data must be made available to the GPU. This often requires performing a memory copy. It is desirable not to cross the CPU/GPU @@ -229,80 +270,10 @@ To achieve best performance, TensorFlow Lite makes it possible for users to directly read from and write to the TensorFlow hardware buffer and bypass avoidable memory copies. -#### Android - -Assuming the image input is in the GPU memory, it must first be converted to an -OpenGL Shader Storage Buffer Object (SSBO). You can associate a TfLiteTensor to -a user-prepared SSBO with `Interpreter.bindGlBufferToTensor()`. Note that -`Interpreter.bindGlBufferToTensor()` must be called before -`Interpreter.modifyGraphWithDelegate()`. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create an SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, inputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int inputSsboId = id[0]; - -// Create interpreter. -Interpreter interpreter = new Interpreter(tfliteModel); -Tensor inputTensor = interpreter.getInputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(inputTensor, inputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null input argument indicates use of the bound buffer for input. 
-fillSsboWithCameraImageTexture(inputSsboId); -float[] outputArray = new float[outputSize]; -interpreter.runInference(null, outputArray); -``` - -A similar approach can be applied to the output tensor. In that case, -`Interpreter.Options.setAllowBufferHandleOutput(true)` should be passed on, to -disable the default copying of the network's output from GPU memory to CPU -memory. - -```java -// Ensure a valid EGL rendering context. -EGLContext eglContext = eglGetCurrentContext(); -if (eglContext.equals(EGL_NO_CONTEXT)) return false; - -// Create a SSBO. -int[] id = new int[1]; -glGenBuffers(id.length, id, 0); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, id[0]); -glBufferData(GL_SHADER_STORAGE_BUFFER, outputSize, null, GL_STREAM_COPY); -glBindBuffer(GL_SHADER_STORAGE_BUFFER, 0); // unbind -int outputSsboId = id[0]; - -// Create interpreter. -Interpreter.Options options = (new Interpreter.Options()).setAllowBufferHandleOutput(true); -Interpreter interpreter = new Interpreter(tfliteModel, options); -Tensor outputTensor = interpreter.getOutputTensor(0); -GpuDelegate gpuDelegate = new GpuDelegate(); -// The buffer must be bound before the delegate is installed. -gpuDelegate.bindGlBufferToTensor(outputTensor, outputSsboId); -interpreter.modifyGraphWithDelegate(gpuDelegate); - -// Run inference; the null output argument indicates use of the bound buffer for output. -ByteBuffer input = getCameraImageByteBuffer(); -interpreter.runInference(input, null); -renderOutputSsbo(outputSsboId); -``` - -#### iOS - Assuming the image input is in GPU memory, it must first be converted to a `MTLBuffer` object for Metal. You can associate a TfLiteTensor to a -user-prepared `MTLBuffer` with `BindMetalBufferToTensor()`. Note that -`BindMetalBufferToTensor()` must be called before +user-prepared `MTLBuffer` with `TFLGpuDelegateBindMetalBufferToTensor()`. Note +that `TFLGpuDelegateBindMetalBufferToTensor()` must be called before `Interpreter::ModifyGraphWithDelegate()`. Additionally, the inference output is, by default, copied from GPU memory to CPU memory. This behavior can be turned off by calling `Interpreter::SetAllowBufferHandleOutput(true)` during @@ -312,8 +283,8 @@ initialization. // Prepare GPU delegate. auto* delegate = NewGpuDelegate(nullptr); interpreter->SetAllowBufferHandleOutput(true); // disable default gpu->cpu copy -if (!BindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; -if (!BindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->inputs()[0], user_provided_input_buffer)) return false; +if (!TFLGpuDelegateBindMetalBufferToTensor(delegate, interpreter->outputs()[0], user_provided_output_buffer)) return false; if (interpreter->ModifyGraphWithDelegate(delegate) != kTfLiteOk) return false; // Run inference. 
diff --git a/tensorflow/lite/g3doc/performance/model_optimization.md b/tensorflow/lite/g3doc/performance/model_optimization.md index feb6cfecea6..c66b06f9b59 100644 --- a/tensorflow/lite/g3doc/performance/model_optimization.md +++ b/tensorflow/lite/g3doc/performance/model_optimization.md @@ -89,9 +89,9 @@ The following types of quantization are available in TensorFlow Lite: Technique | Data requirements | Size reduction | Accuracy | Supported hardware ------------------------------------------------------------------------------------------------------- | -------------------------------- | -------------- | --------------------------- | ------------------ [Post-training float16 quantization](post_training_float16_quant.ipynb) | No data | Up to 50% | Insignificant accuracy loss | CPU, GPU -[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU -[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, EdgeTPU, Hexagon DSP -[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, EdgeTPU, Hexagon DSP +[Post-training dynamic range quantization](post_training_quant.ipynb) | No data | Up to 75% | Accuracy loss | CPU, GPU (Android) +[Post-training integer quantization](post_training_integer_quant.ipynb) | Unlabelled representative sample | Up to 75% | Smaller accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP +[Quantization-aware training](http://www.tensorflow.org/model_optimization/guide/quantization/training) | Labelled training data | Up to 75% | Smallest accuracy loss | CPU, GPU (Android), EdgeTPU, Hexagon DSP Below are the latency and accuracy results for post-training quantization and quantization-aware training on a few models. All latency numbers are measured on From f5c5747f134b3dfd42b1d546f1842aa2e1e70670 Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Mon, 18 May 2020 15:29:57 -0700 Subject: [PATCH 381/412] Re-enable signal kernel tests on py38 PiperOrigin-RevId: 312166420 Change-Id: Ie18cf2e29d8a05d57675ce3e75b06509205a4e61 --- tensorflow/python/kernel_tests/signal/BUILD | 1 - .../python/kernel_tests/signal/test_util.py | 4 +--- .../kernel_tests/signal/window_ops_test.py | 17 ++++++++--------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD index adb12a5e850..bd893184570 100644 --- a/tensorflow/python/kernel_tests/signal/BUILD +++ b/tensorflow/python/kernel_tests/signal/BUILD @@ -149,7 +149,6 @@ cuda_py_tests( python_version = "PY3", shard_count = 4, tags = [ - "no_oss_py38", #TODO(b/151631881) "no_windows_gpu", ], deps = [ diff --git a/tensorflow/python/kernel_tests/signal/test_util.py b/tensorflow/python/kernel_tests/signal/test_util.py index 1e95fe4b28f..e8d477a843b 100644 --- a/tensorflow/python/kernel_tests/signal/test_util.py +++ b/tensorflow/python/kernel_tests/signal/test_util.py @@ -50,7 +50,7 @@ def grappler_optimize(graph, fetches=None, config_proto=None): return tf_optimizer.OptimizeGraph(config_proto, metagraph) -def tflite_convert(fn, input_templates, use_mlir=False): +def tflite_convert(fn, input_templates): """Converts the provided fn to tf.lite model. 
Args: @@ -59,7 +59,6 @@ def tflite_convert(fn, input_templates, use_mlir=False): input_templates: A list of Tensors, ndarrays or TensorSpecs describing the inputs that fn expects. The actual values of the Tensors or ndarrays are unused. - use_mlir: Experimental. Whether to use the tf.lite MLIR converter. Returns: The serialized tf.lite model. @@ -67,7 +66,6 @@ def tflite_convert(fn, input_templates, use_mlir=False): fn = def_function.function(fn) concrete_func = fn.get_concrete_function(*input_templates) converter = lite.TFLiteConverterV2([concrete_func]) - converter.experimental_new_converter = use_mlir return converter.convert() diff --git a/tensorflow/python/kernel_tests/signal/window_ops_test.py b/tensorflow/python/kernel_tests/signal/window_ops_test.py index 9f5fe6f64c7..9432e70c7f2 100644 --- a/tensorflow/python/kernel_tests/signal/window_ops_test.py +++ b/tensorflow/python/kernel_tests/signal/window_ops_test.py @@ -156,15 +156,14 @@ class WindowOpsTest(test.TestCase, parameterized.TestCase): self.assertLen(rewritten_graph.node, 1) @parameterized.parameters( - # Due to control flow, only MLIR is supported. # Only float32 is supported. - (window_ops.hann_window, 10, False, dtypes.float32, True), - (window_ops.hann_window, 10, True, dtypes.float32, True), - (window_ops.hamming_window, 10, False, dtypes.float32, True), - (window_ops.hamming_window, 10, True, dtypes.float32, True), - (window_ops.vorbis_window, 12, None, dtypes.float32, True)) - def test_tflite_convert(self, window_fn, window_length, periodic, dtype, - use_mlir): + (window_ops.hann_window, 10, False, dtypes.float32), + (window_ops.hann_window, 10, True, dtypes.float32), + (window_ops.hamming_window, 10, False, dtypes.float32), + (window_ops.hamming_window, 10, True, dtypes.float32), + (window_ops.vorbis_window, 12, None, dtypes.float32)) + def test_tflite_convert(self, window_fn, window_length, periodic, dtype): + def fn(window_length): try: return window_fn(window_length, periodic=periodic, dtype=dtype) @@ -172,7 +171,7 @@ class WindowOpsTest(test.TestCase, parameterized.TestCase): return window_fn(window_length, dtype=dtype) tflite_model = test_util.tflite_convert( - fn, [tensor_spec.TensorSpec(shape=[], dtype=dtypes.int32)], use_mlir) + fn, [tensor_spec.TensorSpec(shape=[], dtype=dtypes.int32)]) window_length = np.array(window_length).astype(np.int32) actual_output, = test_util.evaluate_tflite_model( tflite_model, [window_length]) From 94108993a3adc322b67d35244c8488ead4034dee Mon Sep 17 00:00:00 2001 From: Michael Gester Date: Mon, 18 May 2020 15:35:17 -0700 Subject: [PATCH 382/412] Allow static result shape for unranked operand in shape verifier Previously, a static result shape for an unranked operand produced an error in shape verifier. This was too restrictive because shape inference is often incomplete at this point. 
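As a minimal sketch of the relaxed behavior (adapted from the updated tf-ops.mlir tests in this change; the function name below is illustrative only), a pattern like the following, which previously failed verification, now only emits a warning:

func @shape_of_unranked(%arg0: tensor<*xf32>) -> tensor<2xi32> {
  // Previously a verifier error; now reported as a warning:
  // "has static shape result for unranked operand"
  %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor<2xi32>
  return %0 : tensor<2xi32>
}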
PiperOrigin-RevId: 312167322 Change-Id: Ia198f07699174a4ea3c77099c9408def95e058be --- tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc | 9 ++++++--- tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir | 6 +++--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 78623ca3c61..69b8f15320f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -2603,9 +2603,12 @@ LogicalResult VerifyShapeOperandAndResult(Operation *op, Type operand_type, << variadic_idx_str << " to match rank of operand" << variadic_idx_str; } else if (result_ranked_type.hasStaticShape()) { - // The operand is an unranked tensor, verify that the result is dynamic. - return op->emitOpError("requires dynamic shape result") - << variadic_idx_str << " for unranked operand" << variadic_idx_str; + // The operand is an unranked tensor, print a warning if the result + // is static. + // Note: We do not handle this situation as an error, this would be too + // restrictive due to incompleteness of shape inference at this point. + op->emitWarning("has static shape result") + << variadic_idx_str << " for unranked operand" << variadic_idx_str; } Type element_type = result_ranked_type.getElementType(); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index ffa287e0e53..3560fec7b7d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1326,7 +1326,7 @@ func @testShapeMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.Shape"(%arg0) {T = "tfdtype$DT_FLOAT", output = "tfdtype$DT_INT32"} : (tensor<*xf32>) -> tensor<2xi32> return %0 : tensor<2xi32> } @@ -1370,7 +1370,7 @@ func @testShapeNMismatchDim(tensor<1x32x32x16xf32>) -> tensor<2xi32> { func @testShapeNWrongResultDimDynamic(tensor<*xf32>) -> tensor<2xi32> { ^bb0(%arg0: tensor<*xf32>): - // expected-error @+1 {{requires dynamic shape result #1 for unranked operand #1}} + // expected-warning @+1 {{has static shape result #1 for unranked operand #1}} %0:2 = "tf.ShapeN"(%arg0, %arg0) : (tensor<*xf32>, tensor<*xf32>) -> (tensor, tensor<2xi32>) return %0#1 : tensor<2xi32> } @@ -1428,7 +1428,7 @@ func @testVariableShapeMismatchDim(%arg0: tensor<*x!tf.resource>>) -> tensor<2xi32> { - // expected-error @+1 {{requires dynamic shape result for unranked operand}} + // expected-warning @+1 {{has static shape result for unranked operand}} %0 = "tf.VariableShape"(%arg0) {output = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>>) -> tensor<2xi32> return %0 : tensor<2xi32> } From 1acf6989bf72de324f61be20491a7c017a7da5c6 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Mon, 18 May 2020 15:51:05 -0700 Subject: [PATCH 383/412] Fix argument check tests to work in eager mode PiperOrigin-RevId: 312170271 Change-Id: Ie7ffb52cf63559255b5463d651eb72b924a3c3bf --- .../core/kernels/reverse_sequence_op.cc | 44 +++++----- .../kernel_tests/reverse_sequence_op_test.py | 83 +++++++++---------- tensorflow/python/ops/array_ops.py | 8 +- 3 files changed, 67 insertions(+), 68 deletions(-) diff --git 
a/tensorflow/core/kernels/reverse_sequence_op.cc b/tensorflow/core/kernels/reverse_sequence_op.cc index 0e112133915..b5b62bc76ca 100644 --- a/tensorflow/core/kernels/reverse_sequence_op.cc +++ b/tensorflow/core/kernels/reverse_sequence_op.cc @@ -43,9 +43,9 @@ typedef Eigen::GpuDevice GPUDevice; template void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); - auto seq_lens_t = seq_lens.vec(); + auto seq_lens_t = seq_lengths.vec(); std::vector seq_lens_vec(seq_lens_t.size()); @@ -56,15 +56,16 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), - errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + errors::InvalidArgument("seq_dim must be < input rank", " ( ", seq_dim, " vs. ", input.dims(), ")")); OP_REQUIRES(context, batch_dim < input.dims(), - errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + errors::InvalidArgument("batch_dim must be < input rank", " ( ", batch_dim, " vs. ", input.dims(), ")")); - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), - errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, - "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim), ")")); + OP_REQUIRES( + context, seq_lengths.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("Length of seq_lengths != input.dims(", batch_dim, + "), ", "(", seq_lengths.NumElements(), " vs. ", + input.dim_size(batch_dim), ")")); for (size_t d = 0; d < seq_lens_vec.size(); ++d) { OP_REQUIRES(context, seq_lens_vec[d] >= 0, @@ -77,21 +78,22 @@ void CheckErrors(OpKernelContext* context, int batch_dim, int seq_dim) { void CheckErrorsGPU(OpKernelContext* context, int batch_dim, int seq_dim) { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); OP_REQUIRES(context, batch_dim != seq_dim, errors::InvalidArgument("batch_dim == seq_dim == ", seq_dim)); OP_REQUIRES(context, seq_dim < input.dims(), - errors::InvalidArgument("seq_dim must be < input.dims()", "( ", + errors::InvalidArgument("seq_dim must be < input rank", " ( ", seq_dim, " vs. ", input.dims(), ")")); OP_REQUIRES(context, batch_dim < input.dims(), - errors::InvalidArgument("batch_dim must be < input.dims()", "( ", + errors::InvalidArgument("batch_dim must be < input rank", " ( ", batch_dim, " vs. ", input.dims(), ")")); - OP_REQUIRES(context, seq_lens.NumElements() == input.dim_size(batch_dim), - errors::InvalidArgument("len(seq_lens) != input.dims(", batch_dim, - "), ", "(", seq_lens.NumElements(), - " vs. ", input.dim_size(batch_dim), ")")); + OP_REQUIRES( + context, seq_lengths.NumElements() == input.dim_size(batch_dim), + errors::InvalidArgument("Length of seq_lengths != input.dims(", batch_dim, + "), ", "(", seq_lengths.NumElements(), " vs. ", + input.dim_size(batch_dim), ")")); } template <> @@ -117,14 +119,14 @@ class ReverseSequenceOp : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& input = context->input(0); - const Tensor& seq_lens = context->input(1); + const Tensor& seq_lengths = context->input(1); // Preliminary validation of sizes. 
- OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lens.shape()), - errors::InvalidArgument("seq_lens input must be 1-dim, not ", - seq_lens.dims())); + OP_REQUIRES(context, TensorShapeUtils::IsVector(seq_lengths.shape()), + errors::InvalidArgument("seq_lengths must be 1-dim, not ", + seq_lengths.dims())); - auto seq_lens_t = seq_lens.vec(); + auto seq_lens_t = seq_lengths.vec(); CheckErrors(context, batch_dim_, seq_dim_); if (!context->status().ok()) return; @@ -186,7 +188,7 @@ namespace functor { void ReverseSequence::Compute( \ const GPUDevice& d, typename TTypes::ConstTensor input, \ int32 batch_dim, int32 seq_dim, \ - typename TTypes::ConstVec seq_lens, \ + typename TTypes::ConstVec seq_lengths, \ typename TTypes::Tensor output); \ extern template struct ReverseSequence; diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py index 05307c9834a..267decff38b 100644 --- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py +++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py @@ -19,10 +19,11 @@ from __future__ import division from __future__ import print_function import numpy as np -from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.python.eager import context from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker @@ -135,56 +136,52 @@ class ReverseSequenceTest(test.TestCase): print("ReverseSequence gradient error = %g" % err) self.assertLess(err, 1e-8) - @test_util.run_deprecated_v1 def testShapeFunctionEdgeCases(self): - t = array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=None), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - batch_axis=0, - seq_axis=1) - self.assertIs(t.get_shape().ndims, None) + # Enter graph mode since we want to test partial shapes + with context.graph_mode(): + t = array_ops.reverse_sequence( + array_ops.placeholder(dtypes.float32, shape=None), + seq_lengths=array_ops.placeholder(dtypes.int64, shape=(32,)), + batch_axis=0, + seq_axis=1) + self.assertIs(t.get_shape().ndims, None) + def testInvalidArguments(self): # Batch size mismatched between input and seq_lengths. - with self.assertRaises(ValueError): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(33,)), - seq_axis=3) + # seq_length too long + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + (r"Dimensions must be equal|" + r"Length of seq_lengths != input.dims\(0\)")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2, 2], seq_axis=1) + + # seq_length too short + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + (r"Dimensions must be equal|" + r"Length of seq_lengths != input.dims\(0\)")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2], seq_axis=1) + + # Invalid seq_length shape + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + ("Shape must be rank 1 but is rank 2|" + "seq_lengths must be 1-dim")): + array_ops.reverse_sequence([[1, 2], [3, 4]], [[2, 2]], seq_axis=1) # seq_axis out of bounds. 
- with self.assertRaisesRegexp(ValueError, "seq_dim must be < input rank"): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - seq_axis=3) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + "seq_dim must be < input rank"): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], seq_axis=2) # batch_axis out of bounds. - with self.assertRaisesRegexp(ValueError, "batch_dim must be < input rank"): - array_ops.reverse_sequence( - array_ops.placeholder( - dtypes.float32, shape=(32, 2, 3)), - seq_lengths=array_ops.placeholder( - dtypes.int64, shape=(32,)), - seq_axis=0, - batch_axis=3) + with self.assertRaisesRegexp((ValueError, errors.InvalidArgumentError), + "batch_dim must be < input rank"): + array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], + seq_axis=1, + batch_axis=3) - with self.cached_session(): - inputs = array_ops.placeholder(dtypes.float32, shape=(32, 2, 3)) - seq_lengths = array_ops.placeholder(dtypes.int64, shape=(32,)) - output = array_ops.reverse_sequence( - inputs, seq_lengths=seq_lengths, - seq_axis=0) # batch_axis default is 0 - with self.assertRaisesOpError("batch_dim == seq_dim"): - output.eval(feed_dict={ - inputs: np.random.rand(32, 2, 3), - seq_lengths: xrange(32) - }) + with self.assertRaisesRegexp((errors.OpError, errors.InvalidArgumentError), + "batch_dim == seq_dim == 0"): + output = array_ops.reverse_sequence([[1, 2], [3, 4]], [2, 2], seq_axis=0) + self.evaluate(output) if __name__ == "__main__": diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index a2640925a38..ce0755fc782 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -4473,8 +4473,8 @@ def reverse_sequence(input, dimension `seq_axis`. The elements of `seq_lengths` must obey `seq_lengths[i] <= - input.dims[seq_dim]`, and `seq_lengths` must be a vector of length - `input.dims[batch_dim]`. + input.dims[seq_axis]`, and `seq_lengths` must be a vector of length + `input.dims[batch_axis]`. The output slice `i` along dimension `batch_axis` is then given by input slice `i`, with the first `seq_lengths[i]` slices along @@ -4496,8 +4496,8 @@ def reverse_sequence(input, Args: input: A `Tensor`. The input to reverse. seq_lengths: A `Tensor`. Must be one of the following types: `int32`, - `int64`. 1-D with length `input.dims(batch_dim)` and `max(seq_lengths) <= - input.dims(seq_dim)` + `int64`. 1-D with length `input.dims(batch_axis)` and `max(seq_lengths) <= + input.dims(seq_axis)` seq_axis: An `int`. The dimension which is partially reversed. batch_axis: An optional `int`. Defaults to `0`. The dimension along which reversal is performed. From ad6e816328507f80c30d25d73b0c03219d339dd6 Mon Sep 17 00:00:00 2001 From: Hanhan Wang Date: Mon, 18 May 2020 16:06:46 -0700 Subject: [PATCH 384/412] Add lowering from xla_hlo/lhlo reverse op to Linalg. This is only supported for static shape. 
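As a sketch of what the new pattern handles (this example is taken from the test cases added in this change), reversing dimension 1 of a statically shaped 2x3 tensor lowers to a linalg.generic whose operand indexing map flips that dimension, (d0, d1) -> (d0, -d1 + 2), while the result map remains the identity (d0, d1) -> (d0, d1):

func @reverse(%input: tensor<2x3xf32>) -> tensor<2x3xf32> {
  // Reverse along dimension 1; the static extent (3) is what allows the
  // constant expression -d1 + 2 in the operand indexing map.
  %result = "xla_hlo.reverse"(%input) {
    dimensions = dense<1> : tensor<1xi64>
  } : (tensor<2x3xf32>) -> tensor<2x3xf32>
  return %result : tensor<2x3xf32>
}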
PiperOrigin-RevId: 312173157 Change-Id: Iab149f02153597ef5a967628397fcac9a4db1329 --- .../xla/tests/hlo-legalize-to-linalg.mlir | 13 ++++++++ .../xla/tests/lhlo-legalize-to-linalg.mlir | 13 ++++++++ .../xla/transforms/xla_legalize_to_linalg.cc | 30 +++++++++++++++++++ 3 files changed, 56 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir index a856ee5e83c..a27bf2cff79 100644 --- a/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/hlo-legalize-to-linalg.mlir @@ -542,3 +542,16 @@ func @convert_f32_to_i32(%input: tensor<2x2xf32>) -> tensor<2x2xi32> { // CHECK-NEXT: ^bb0(%[[OPERAND_IN:.*]]: f32): // CHECK-NEXT: %[[RESULT:.*]] = fptosi %[[OPERAND_IN]] : f32 to i32 // CHECK-NEXT: linalg.yield %[[RESULT]] : i32 + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%input: tensor<2x3xf32>) -> tensor<2x3xf32> { + %result = "xla_hlo.reverse"(%input) { + dimensions = dense<1> : tensor<1xi64> + } : (tensor<2x3xf32>) -> tensor<2x3xf32> + return %result : tensor<2x3xf32> +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir index bb8010b520c..626e905695c 100644 --- a/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir +++ b/tensorflow/compiler/mlir/xla/tests/lhlo-legalize-to-linalg.mlir @@ -636,3 +636,16 @@ func @reshape_2D_4D(%arg0: memref<12x42xi32>, %arg1 : memref<12x1x42x1xi32>) { return } // CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] + +// ----- + +// CHECK-DAG: #[[OPERAND_MAP:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 2)> +// CHECK-DAG: #[[RESULT_MAP:.*]] = affine_map<(d0, d1) -> (d0, d1)> +// CHECK-LABEL: func @reverse +func @reverse(%arg0: memref<2x3xf32>, %arg1: memref<2x3xf32>) { + "xla_lhlo.reverse"(%arg0, %arg1) { + dimensions = dense<1> : tensor<1xi64> + } : (memref<2x3xf32>, memref<2x3xf32>) -> () + return +} +// CHECK: linalg.generic {{{.*}}indexing_maps = [#[[OPERAND_MAP]], #[[RESULT_MAP]]] diff --git a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc index 799a20aa693..2b496677d62 100644 --- a/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/xla_legalize_to_linalg.cc @@ -573,6 +573,34 @@ class ConstConverter : public OpConversionPattern { } }; +// TODO(b/156787842): Support the lowering for dynamic shapes. 
+template +class ReverseConverter + : public DataMovementOpConverter, OpTy, + isLHLO> { + public: + using DataMovementOpConverter, OpTy, + isLHLO>::DataMovementOpConverter; + static ArrayAttr getIndexingMapsAttr(OpTy op, Builder* b) { + auto resultType = + getXLAOpResultType(op).template cast(); + auto nloops = resultType.getRank(); + SmallVector inputExprs; + inputExprs.reserve(nloops); + for (int i = 0; i < nloops; ++i) + inputExprs.push_back(b->getAffineDimExpr(i)); + for (auto dim : op.dimensions()) { + int i = dim.getZExtValue(); + if (resultType.isDynamicDim(i)) return {}; + int n = resultType.getShape()[i]; + inputExprs[i] = b->getAffineConstantExpr(n - 1) - inputExprs[i]; + } + return b->getAffineMapArrayAttr( + {AffineMap::get(nloops, /*symbolCount=*/0, inputExprs, b->getContext()), + b->getMultiDimIdentityMap(nloops)}); + } +}; + class SliceConverter : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -642,6 +670,7 @@ void populateLHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, + ReverseConverter, ScalarPointwiseToStandardConverter, SliceConverter >(context); @@ -742,6 +771,7 @@ void populateHLOToLinalgConversionPattern(MLIRContext* context, PointwiseToLinalgConverter, ReshapeAddRemoveDimConverter, ReshapeOpConverter, + ReverseConverter, TransposeConverter>(context); } From ad6798a2f62ae2cb7f433af7b721bf14b9850dde Mon Sep 17 00:00:00 2001 From: Berkin Ilbeyi Date: Mon, 18 May 2020 17:01:57 -0700 Subject: [PATCH 385/412] [XLA] Fix alternate memory allocation of conditional operands. Consider the following flattened HLO schedule of a conditional: 1: a = fusion() true_computation: 2: parameter = parameter(0) 3: ... 4: ... false_computation: 5: parameter = parameter(0) 6: ... 7: ... 8: conditional = conditional(pred, a, a) 9: b = fusion(a) When we had a tensor that was a conditional operand (e.g. "a" in the example), we reserved the alternate memory for the entire 1-8 range. This meant that when we tried to allocate inside the called computations of the conditional, the offset we picked wasn't available since it would fall within the 1-8 range. This CL now reserves the conditional until the parameter of the earliest called computations (1-2 range). To allow efficient use of alternate memory by avoiding a very large conditional from claiming the offset for the entire called computation, the conditional operand might die within the called computation, allowing other HLOs inside the called computations to reclaim that alternate memory offset. This creates a subtlety for subsequent uses of conditional operands (e.g. "a" is used by a fusion at 9). These subsequent uses will force evictions (and then do another prefetch). After optimization, the graph might look like the following: a (Alternate Mem) = fusion() cs0 = copy-start(a) # Must evict a because the allocation may die within # called computation. cd0 (Default Mem) = copy-done(cs0) true_computation: parameter (Alternate Mem) = parameter(0) ... # parameter's alternate memory allocation may die here and another tensor # might use the same offset. false_computation: parameter (Alternate Mem) = parameter(0) ... # parameter's alternate memory allocation may die here and another tensor # might use the same offset. conditional = conditional(pred, a, a) cs1 = copy-start(cd0) # May prefetch the value back to alternate memory. 
cd1 (Alternate Mem) = copy-done(cs1) b = fusion(cd1) PiperOrigin-RevId: 312182824 Change-Id: I3ff5d019025ef96ced1aed4f6d170df677273348 --- .../xla/service/memory_space_assignment.cc | 296 ++++++++++++---- .../xla/service/memory_space_assignment.h | 18 +- .../service/memory_space_assignment_test.cc | 321 +++++++++++++++++- 3 files changed, 563 insertions(+), 72 deletions(-) diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 431e6af2dc0..81a8a102402 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -502,7 +502,8 @@ bool AlternateMemoryBestFitHeap::IsIntervalAllowedInAlternateMemory( } bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( - const HloUse& use) const { + const AllocationValue& value, const HloUse& use) const { + const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); if (use.instruction->opcode() == HloOpcode::kWhile) { HloComputation* while_body = use.instruction->while_body(); @@ -512,7 +513,6 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( HloValue* parameter_value = &alias_analysis_.dataflow_analysis().GetUniqueValueAt( while_body->parameter_instruction(0), use.operand_index); - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); int64 parameter_time = instruction_schedule.at(while_body->parameter_instruction(0)); int64 root_time = instruction_schedule.at(while_body->root_instruction()); @@ -567,7 +567,54 @@ bool AlternateMemoryBestFitHeap::IsUseAllowedInAlternateMemory( "there is a required default memory assignment."; return false; } + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // For any use of this conditional (the same value might be passed into + // multiple called computations), determine if the parameter->first use + // dependency is short. 
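+    // Concretely: for each called computation that receives this value, take
+    // the schedule time of its parameter and the earliest "real" use of that
+    // parameter inside the computation (get-tuple-element, tuple and bitcast
+    // uses are skipped). If the prefetch interval picker reports that a
+    // no-copy alternate-memory allocation can span
+    // [parameter_time, min_use_time], the use is allowed in alternate memory;
+    // otherwise the next called computation is checked.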
+ int64 conditional_time = instruction_schedule.at(use.instruction); + for (const HloUse& other_use : value.uses()) { + if (other_use.instruction != use.instruction) { + continue; + } + HloComputation* called_computation = + use.instruction->called_computations().at(other_use.operand_number - + 1); + const HloInstruction* parameter_instruction = + called_computation->parameter_instruction(0); + HloValue* parameter_value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt( + parameter_instruction, other_use.operand_index); + int64 parameter_time = instruction_schedule.at(parameter_instruction); + int64 min_use_time = conditional_time; + for (const HloUse& parameter_use : parameter_value->uses()) { + if (parameter_use.instruction->parent() == called_computation && + parameter_use.instruction->opcode() != + HloOpcode::kGetTupleElement && + parameter_use.instruction->opcode() != HloOpcode::kTuple && + parameter_use.instruction->opcode() != HloOpcode::kBitcast) { + min_use_time = std::min( + min_use_time, instruction_schedule.at(parameter_use.instruction)); + } + } + if (options_.prefetch_interval_picker->CanAllocateInAlternateMemoryNoCopy( + parameter_value->shape(), parameter_time, min_use_time)) { + VLOG(4) << "Conditional allocation allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + return true; + } else { + VLOG(4) << "Conditional allocation not allowed in alternate memory for " + "computation = " + << called_computation->name() + << ", parameter time = " << parameter_time + << ", min use time = " << min_use_time; + } + } + return false; } + return true; } @@ -769,20 +816,12 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { if (position.instruction->opcode() == HloOpcode::kConditional) { VLOG(3) << "Adding required assignment for condition output: " << value->ToShortString(); - required_assignments_[value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at(position.instruction), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(position.instruction, position.index, + MemorySpace::kDefault); for (const HloComputation* called_computation : position.instruction->called_computations()) { - HloValue* root_value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt( - called_computation->root_instruction(), position.index); - required_assignments_[root_value].push_back( - {MemorySpace::kDefault, - instruction_schedule.at( - called_computation->root_instruction()), - /*chunk=*/absl::nullopt}); + AddRequiredAssignment(called_computation->root_instruction(), + position.index, MemorySpace::kDefault); } } } @@ -808,9 +847,13 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { } // Iterate over the uses. - for (HloUse use : allocation_value.uses()) { + for (int use_idx = 0; use_idx < allocation_value.uses().size(); + ++use_idx) { + const HloUse& use = allocation_value.uses().at(use_idx); int64 use_time = instruction_schedule.at(use.instruction); int64 latest_prefetch_time = use_time; + bool allow_no_copy_alternate_mem_allocation = true; + absl::optional earliest_prefetch_time = absl::nullopt; // Sequential calls include kWhile, kCall, and kConditional opcodes. bool is_sequential_call = @@ -857,14 +900,41 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // when we look at uses within the while loop body. 
use_time = instruction_schedule.at(while_body->parameter_instruction(0)); + } else if (use.instruction->opcode() == HloOpcode::kConditional) { + // Replace the use time with the earliest parameter of called + // computations. + for (const HloComputation* called_computation : + use.instruction->called_computations()) { + use_time = std::min( + use_time, instruction_schedule.at( + called_computation->parameter_instruction(0))); + } } } // Add a required assignment in default memory if the use not allowed in // alternate memory. - if (!IsUseAllowedInAlternateMemory(use)) { - required_assignments_[allocation_value.value()].push_back( - {MemorySpace::kDefault, use_time, /*chunk=*/absl::nullopt}); + if (!IsUseAllowedInAlternateMemory(allocation_value, use)) { + AddRequiredAssignment(allocation_value.value(), use.instruction, + MemorySpace::kDefault, use_time); + } else if (use_idx > 0) { + // We allow buffers in alternate memory that are passed into + // conditionals to give up their alternate memory allocation inside + // the called computation. This means that if a conditional operator + // has an alternate memory allocation, subsequent uses cannot use the + // same alternate memory allocation in order not to clobber data. So + // we force default memory allocation for these subsequent uses. + const HloUse& previous_use = allocation_value.uses().at(use_idx - 1); + if (previous_use.instruction->opcode() == HloOpcode::kConditional && + previous_use.instruction != use.instruction) { + allow_no_copy_alternate_mem_allocation = false; + earliest_prefetch_time = + instruction_schedule.at(previous_use.instruction); + VLOG(3) << "Previous use (" << previous_use.ToString() + << ") of use (" << use.ToString() + << ") is a conditional, so this use will need to evict. " + << "Earliest prefetch time = " << *earliest_prefetch_time; + } } // Bitcasts don't define buffers and don't directly consume buffers. @@ -872,10 +942,16 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { // bitcasts will be handled specially. if (use.instruction->opcode() != HloOpcode::kBitcast) { AllocationRequest request; - request.start_time = definition_time; + // Rarely, (e.g., when conditional true and false parameters are the + // same), definition time can be the time of the conditional and use + // time is the parameter use, which is less. 
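+          // Taking the min below keeps the request interval well-formed
+          // (start_time <= end_time), since end_time is the use time.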
+ request.start_time = std::min(definition_time, use_time); request.end_time = use_time; request.latest_prefetch_time = latest_prefetch_time; request.size = interval.size; + request.allow_no_copy_alternate_mem_allocation = + allow_no_copy_alternate_mem_allocation; + request.earliest_prefetch_time = earliest_prefetch_time; request.preferred_offset = preferred_offset; request.use = use; request.allocation_value = &allocation_value; @@ -1061,35 +1137,42 @@ void AlternateMemoryBestFitHeap::AddAliasedRequiredAssignment( if (aliased_allocation->memory_space() == MemorySpace::kAlternate) { chunk = aliased_allocation->chunk(); } - const auto& instruction_schedule = hlo_live_range_.instruction_schedule(); - HloValue* value = - &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); - int64 instruction_time = instruction_schedule.at(instruction); + AddRequiredAssignment(instruction, index, aliased_allocation->memory_space(), + chunk); +} + +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloValue* value, const HloInstruction* instruction, + MemorySpaceAssignment::MemorySpace memory_space, int64 time, + absl::optional chunk) { // Check for existing required assignment at this time and make sure it is the // same as this if there is one. - auto existing_required_assignment = - RequiredMemoryAssignmentAt(value, instruction_time); + auto existing_required_assignment = RequiredMemoryAssignmentAt(value, time); if (existing_required_assignment) { - CHECK(aliased_allocation->memory_space() == - existing_required_assignment->memory_space); + CHECK(memory_space == existing_required_assignment->memory_space) + << "inst = " << instruction->ToString() << " at " << time; CHECK((!chunk && !existing_required_assignment->chunk) || chunk->offset == existing_required_assignment->chunk->offset); - VLOG(3) << "Not adding aliased required assignment because there is one " - "already: " - << value->ToShortString() << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); - return; + VLOG(3) << "Not adding required assignment because there is one already: " + << value->ToShortString() << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + } else { + VLOG(3) << "Adding required assignment: " << value->ToShortString() + << " at " << time << " at " + << (memory_space == MemorySpace::kDefault ? "def" : "alt"); + required_assignments_[value].push_back({memory_space, time, chunk}); } +} - required_assignments_[value].push_back( - {aliased_allocation->memory_space(), instruction_time, chunk}); - VLOG(3) << "Adding aliased required assignment: " << value->ToShortString() - << " at " << instruction_time << " at " - << (aliased_allocation->memory_space() == MemorySpace::kDefault - ? "def" - : "alt"); +void AlternateMemoryBestFitHeap::AddRequiredAssignment( + const HloInstruction* instruction, ShapeIndex index, + MemorySpace memory_space, absl::optional chunk) { + const HloValue* value = + &alias_analysis_.dataflow_analysis().GetUniqueValueAt(instruction, index); + int64 instruction_time = + hlo_live_range_.instruction_schedule().at(instruction); + AddRequiredAssignment(value, instruction, memory_space, instruction_time, + chunk); } void AlternateMemoryBestFitHeap::AddInputAndOutputRequiredAssignments() { @@ -1289,6 +1372,7 @@ bool AlternateMemoryBestFitHeap::FindAllocation( // First try keeping the allocation entirely in the alternate memory. 
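+  // The no-copy path is also skipped when the request disallows it
+  // (allow_no_copy_alternate_mem_allocation is false), e.g. for a use that
+  // follows a conditional use of the same value and therefore forces an
+  // eviction.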
if (required_memory_space_at_start != MemorySpace::kDefault && required_memory_space_at_end != MemorySpace::kDefault && + request.allow_no_copy_alternate_mem_allocation && AllocateInAlternateMemoryNoCopy(request)) { return true; } @@ -1618,9 +1702,14 @@ bool AlternateMemoryBestFitHeap::Prefetch( // ^ ^ // Copy Copy // Start Done - options_.prefetch_interval_picker->Begin( - request.use, prev_allocation_in_default_mem.earliest_available_time(), - request.latest_prefetch_time); + int64 earliest_prefetch_time = + prev_allocation_in_default_mem.earliest_available_time(); + if (request.earliest_prefetch_time) { + earliest_prefetch_time = + std::max(earliest_prefetch_time, *request.earliest_prefetch_time); + } + options_.prefetch_interval_picker->Begin(request.use, earliest_prefetch_time, + request.latest_prefetch_time); VLOG(3) << "Trying prefetch picker = " << options_.prefetch_interval_picker->ToDebugString(); @@ -2435,6 +2524,34 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { std::tuple> events; + auto add_allocation_and_verify = [&](int64 start_time, int64 end_time, + const Chunk& chunk, + const HloValue* value) { + events[std::make_tuple(start_time, /*is_free=*/false, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); + events[std::make_tuple(end_time, /*is_free=*/true, value->id())] = + std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); + + // Get the chunks overlapping in time and search if they overlap in space + // as well. + // TODO(berkin): For now checking against end_time - 1 (exclusive), but we + // really should check against end_time (inclusive) for cases where the + // operand can't share buffer with user (see + // HloDataflowAnalysis::CanShareOperandBufferWithUser). + for (const Chunk& overlapping_chunk : + interval_tree.ChunksOverlappingInTime(start_time, end_time - 1)) { + if (chunk.OverlapsWith(overlapping_chunk)) { + return InternalError( + ("Value %s (%d, %d) off: %d size: %d overlaps with another chunk" + " off: %d size: %d"), + value->ToShortString(), start_time, end_time, chunk.offset, + chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + } + } + interval_tree.Add(start_time, end_time - 1, chunk); + return Status::OK(); + }; + // Go through all instructions in the module to ensure CopyStart/CopyDone // instructions copy between alternate memory and default memory. for (const HloComputation* computation : @@ -2470,34 +2587,73 @@ Status MemorySpaceAssignment::VerifyAndExportHeapSimulatorTrace() { for (const HloValue* value : buffer.values()) { const HloLiveRange::TimeBound& time_bound = hlo_live_range->buffer_live_ranges().at(value); - events[std::make_tuple(time_bound.start, /*is_free=*/false, - value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::ALLOC); - events[std::make_tuple(time_bound.end, /*is_free=*/true, value->id())] = - std::make_tuple(value, chunk, HeapSimulatorTrace::Event::FREE); - - VLOG(3) << " buffer: " << buffer.ToString() - << " value: " << value->ToShortString() << ": (" - << time_bound.start << ", " << time_bound.end - << ") off: " << chunk.offset << ", size: " << chunk.size; - // Get the chunks overlapping in time and search if they overlap in space - // as well. - // TODO(berkin): For now checking against end_time - 1 (exclusive), but we - // really should check against end_time (inclusive) for cases where the - // operand can't share buffer with user (see - // HloDataflowAnalysis::CanShareOperandBufferWithUser). 
- for (const Chunk& overlapping_chunk : - interval_tree.ChunksOverlappingInTime(time_bound.start, - time_bound.end - 1)) { - if (chunk.OverlapsWith(overlapping_chunk)) { - return InternalError( - ("Buffer %s (%d, %d) off: %d size: %d overlaps with another chunk" - " off: %d size: %d"), - buffer.ToString(), time_bound.start, time_bound.end, chunk.offset, - chunk.size, overlapping_chunk.offset, overlapping_chunk.size); + const HloInstruction* last_use_instruction = nullptr; + int64 last_use_time = time_bound.start; + for (const HloUse& use : value->uses()) { + int64 use_time = + hlo_live_range->instruction_schedule().at(use.instruction); + if (use_time > last_use_time) { + last_use_time = use_time; + last_use_instruction = use.instruction; } } - interval_tree.Add(time_bound.start, time_bound.end - 1, chunk); + + if (last_use_instruction && + last_use_instruction->opcode() == HloOpcode::kConditional) { + // Special case when verifying conditional: we internally split the use + // of alternate memory in conditionals, so fish them out from the + // conditionals. + VLOG(3) << " Splitting conditional buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + int64 earliest_computation_start_time = time_bound.end; + for (const HloComputation* called_computation : + last_use_instruction->called_computations()) { + earliest_computation_start_time = + std::min(earliest_computation_start_time, + hlo_live_range->computation_span_times() + .at(called_computation) + .start); + int64 parameter_time = -1; + int64 last_use_time = -1; + for (const HloPosition& position : value->positions()) { + if (position.instruction->opcode() == HloOpcode::kParameter && + position.instruction->parent() == called_computation) { + parameter_time = hlo_live_range->instruction_schedule().at( + position.instruction); + break; + } + } + for (const HloUse& use : value->uses()) { + if (use.instruction->parent() == called_computation) { + last_use_time = std::max( + last_use_time, + hlo_live_range->instruction_schedule().at(use.instruction)); + } + } + if (last_use_time != -1) { + CHECK_NE(parameter_time, -1); + VLOG(3) << " computation: " << called_computation->name() << ": (" + << parameter_time << ", " << last_use_time << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + parameter_time, last_use_time, chunk, value)); + } + } + VLOG(3) << " from beginning until first computation: (" + << time_bound.start << ", " + << (earliest_computation_start_time - 1) << ")"; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, earliest_computation_start_time - 1, chunk, + value)); + } else { + VLOG(3) << " buffer: " << buffer.ToString() + << " value: " << value->ToShortString() << ": (" + << time_bound.start << ", " << time_bound.end + << ") off: " << chunk.offset << ", size: " << chunk.size; + TF_RETURN_IF_ERROR(add_allocation_and_verify( + time_bound.start, time_bound.end, chunk, value)); + } } } diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.h b/tensorflow/compiler/xla/service/memory_space_assignment.h index 727b8da6c08..340446d21dd 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.h +++ b/tensorflow/compiler/xla/service/memory_space_assignment.h @@ -816,11 +816,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { // use_times is a sorted sequence of the times of all uses. 
// latest_prefetch_time is the latest time we can schedule the CopyDone for a // prefetch. + // If allow_no_copy_alternate_mem_allocation is false, an eviction is forced. + // If earliest_prefetch_time is set, prefetches cannot start before this + // value. struct AllocationRequest { int64 start_time; int64 end_time; int64 latest_prefetch_time; int64 size; + bool allow_no_copy_alternate_mem_allocation; + absl::optional earliest_prefetch_time; absl::optional preferred_offset; HloUse use; MemorySpaceAssignment::AllocationValue* allocation_value; @@ -841,7 +846,8 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { bool IsIntervalAllowedInAlternateMemory(const BufferInterval& interval) const; // Returns true if the use is allowed in the alternate memory. - bool IsUseAllowedInAlternateMemory(const HloUse& use) const; + bool IsUseAllowedInAlternateMemory(const AllocationValue& value, + const HloUse& use) const; // Given an HloValue, creates AllocationValue objects and corresponding // AllocationSequences and appends them into allocation_sequence_list_. @@ -895,6 +901,16 @@ class AlternateMemoryBestFitHeap : public GlobalDecreasingSizeBestFitHeap { const HloInstruction* instruction, ShapeIndex index, const MemorySpaceAssignment::Allocation* aliased_allocation); + // This sets a required assignment. CHECK fails if there is a conflicting + // required assignment at the same time. + void AddRequiredAssignment(const HloValue* value, + const HloInstruction* instruction, + MemorySpace memory_space, int64 time, + absl::optional chunk = absl::nullopt); + void AddRequiredAssignment(const HloInstruction* instruction, + ShapeIndex index, MemorySpace memory_space, + absl::optional chunk = absl::nullopt); + // Adds input and outputs as required assignments. void AddInputAndOutputRequiredAssignments(); diff --git a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc index 984f2e7b4ea..a9be3850d89 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment_test.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment_test.cc @@ -1663,6 +1663,324 @@ TEST_P(MemorySpaceAssignmentTest, ControlPredecessorsBug) { AssignMemorySpace(module.get()); } +TEST_P(MemorySpaceAssignmentTest, ConditionalShouldBeAllocatedInAlternateMem) { + // Checks if simple conditionals get alternate memory allocations. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy and gtes got alternate memory allocations. 
+ auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("neg1"); + auto neg1_operand = neg1->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto neg2 = module->GetComputationWithName("false_computation") + ->GetInstructionWithName("neg2"); + auto neg2_operand = neg2->operand(0); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalAvoidsUnnecessaryPrefetch) { + // Checks if we avoid unnecessary allocation in alternate memory if the input + // won't be used in the computation for a long time. + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + neg0 = f32[3]{0} negate(gte0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + neg9 = f32[3]{0} negate(neg8) + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + ROOT add = f32[3]{0} add(neg9, gte1) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + ROOT conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Check that copy1 doesn't get unnecessarily allocated in alternate mem + // (due to long negate chain in true_computation) but is prefetched before + // add. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kDefaultMemorySpace); + auto add = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add"); + auto add_operand = add->operand(1); + EXPECT_EQ(add_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUse) { + // Make sure there is an evict when there is a conditional use followed by + // another use. 
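+  // In this graph copy1 is passed to the conditional (through tuple0) and is
+  // later consumed by add1, so its alternate-memory allocation is given up at
+  // the conditional and add1 reads the evicted value (a copy-done in default
+  // memory, checked below).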
+ absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}, f32[3]{0}) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + add0 = f32[3]{0} add(gte0, gte1) + neg0 = f32[3]{0} negate(add0) + neg1 = f32[3]{0} negate(neg0) + neg2 = f32[3]{0} negate(neg1) + neg3 = f32[3]{0} negate(neg2) + neg4 = f32[3]{0} negate(neg3) + neg5 = f32[3]{0} negate(neg4) + neg6 = f32[3]{0} negate(neg5) + neg7 = f32[3]{0} negate(neg6) + neg8 = f32[3]{0} negate(neg7) + ROOT neg9 = f32[3]{0} negate(neg8) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg = f32[3]{0} negate(gte) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple0 = (f32[3]{0}, f32[3]{0}) tuple(copy0, copy1) + tuple1 = (f32[3]{0}) tuple(copy0) + conditional = f32[3]{0} conditional(p1, tuple0, tuple1), true_computation=true_computation, false_computation=false_computation + ROOT add1 = f32[3]{0} add(copy1, conditional) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure the copy1->add edge is in alternate memory. Before conditional, + // this should be evicted to default memory and neg uses the input from + // default memory. + auto copy1 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy1"); + EXPECT_EQ(copy1->shape().layout().memory_space(), kAlternateMemorySpace); + auto add0 = module->GetComputationWithName("true_computation") + ->GetInstructionWithName("add0"); + auto add0_operand = add0->operand(1); + EXPECT_EQ(add0_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + auto add1 = + module->GetComputationWithName("entry")->GetInstructionWithName("add1"); + auto add1_operand = add1->operand(0); + EXPECT_EQ(add1_operand->shape().layout().memory_space(), + kDefaultMemorySpace); + EXPECT_EQ(add1_operand->opcode(), HloOpcode::kCopyDone); + } +} + +TEST_P(MemorySpaceAssignmentTest, ConditionalMultiUseInWhile) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + while_cond { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + ROOT gte = pred[] get-tuple-element(p0), index=2 + } + + while_body { + p0 = (f32[3]{0}, f32[3]{0}, pred[]) parameter(0) + gte0 = f32[3]{0} get-tuple-element(p0), index=0 + gte1 = f32[3]{0} get-tuple-element(p0), index=1 + gte2 = pred[] get-tuple-element(p0), index=2 + cond_tuple = (f32[3]{0}) tuple(gte0) + conditional = f32[3]{0} conditional(gte2, cond_tuple, cond_tuple), true_computation=true_computation, false_computation=false_computation + add = f32[3]{0} add(conditional, gte1) + neg0 = f32[3]{0} negate(add) + neg1 = f32[3]{0} negate(neg0) + ROOT tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(gte0, neg1, gte2) + } + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy0 = f32[3]{0} copy(p0) + copy1 = f32[3]{0} copy(p0) + tuple = (f32[3]{0}, f32[3]{0}, pred[]) tuple(copy0, copy1, p1) + while = (f32[3]{0}, f32[3]{0}, pred[]) while(tuple), 
condition=while_cond, body=while_body + ROOT gte = f32[3]{0} get-tuple-element(while), index=1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure copy1/while{0}/cond_tuple{0} gets alternate memory allocation. + // This will force an eviction and a prefetch for while body root. + auto copy0 = + module->GetComputationWithName("entry")->GetInstructionWithName( + "copy0"); + EXPECT_EQ(copy0->shape().layout().memory_space(), kAlternateMemorySpace); + auto conditional = module->GetComputationWithName("while_body") + ->GetInstructionWithName("conditional"); + auto conditional_operand = conditional->operand(1); + EXPECT_EQ(ShapeUtil::GetSubshape(conditional_operand->shape(), {0}) + .layout() + .memory_space(), + kAlternateMemorySpace); + auto while_root = + module->GetComputationWithName("while_body")->root_instruction(); + auto while_root_operand = while_root->operand(0); + EXPECT_THAT( + while_root_operand, + op::AsyncCopy(kAlternateMemorySpace, kDefaultMemorySpace, + op::AsyncCopy(kDefaultMemorySpace, kAlternateMemorySpace, + op::GetTupleElement(op::Parameter(0))))); + } +} + +TEST_P(MemorySpaceAssignmentTest, NestedConditional) { + absl::string_view hlo_string = R"( + HloModule CondAllocation, is_scheduled=true + + true_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg1 = f32[3]{0} negate(gte) + } + + false_computation2 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg2 = f32[3]{0} negate(gte) + } + + true_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + slice = f32[1]{0} slice(gte), slice={[0:1]} + bitcast = f32[] bitcast(slice) + constant = f32[] constant(0.0) + compare = pred[] compare(bitcast, constant), direction=GT + ROOT conditional = f32[3]{0} conditional(compare, p0, p0), true_computation=true_computation2, false_computation=false_computation2 + } + + false_computation1 { + p0 = (f32[3]{0}) parameter(0) + gte = f32[3]{0} get-tuple-element(p0), index=0 + ROOT neg3 = f32[3]{0} negate(gte) + } + + + ENTRY entry { + p0 = f32[3]{0} parameter(0) + p1 = pred[] parameter(1) + copy = f32[3]{0} copy(p0) + tuple = (f32[3]{0}) tuple(copy) + ROOT conditional = f32[3]{0} conditional(p1, tuple, tuple), true_computation=true_computation1, false_computation=false_computation1 + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnVerifiedModule(hlo_string)); + AssignMemorySpace(module.get()); + + if (GetParam()) { + // Make sure alternate memory allocation gets propagated into both levels of + // conditional. 
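+    // The checks below inspect the operands of neg1/neg2 (the branches of the
+    // inner conditional) and neg3 (the outer false branch).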
+ auto copy = + module->GetComputationWithName("entry")->GetInstructionWithName("copy"); + EXPECT_EQ(copy->shape().layout().memory_space(), kAlternateMemorySpace); + auto neg1_operand = module->GetComputationWithName("true_computation2") + ->GetInstructionWithName("neg1") + ->operand(0); + auto neg2_operand = module->GetComputationWithName("false_computation2") + ->GetInstructionWithName("neg2") + ->operand(0); + auto neg3_operand = module->GetComputationWithName("false_computation1") + ->GetInstructionWithName("neg3") + ->operand(0); + EXPECT_EQ(neg1_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg2_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + EXPECT_EQ(neg3_operand->shape().layout().memory_space(), + kAlternateMemorySpace); + } +} + TEST_P(MemorySpaceAssignmentTest, RequestIdentifierShouldNotBeAllocatedInAlternateMem) { // Ensure that request identifier returned by Send/Recv HLOs are not allocated @@ -2149,7 +2467,8 @@ TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule3) { AssignMemorySpace(module.get(), -1, 5); } -TEST_P(MemorySpaceAssignmentTest, NonEntryComputationSchedule4) { +// TODO(berkin): This might be an incorrect input graph, investigate. +TEST_P(MemorySpaceAssignmentTest, DISABLED_NonEntryComputationSchedule4) { auto module = CreateNewVerifiedModule(); Shape shape = ShapeUtil::MakeShape(xla::F32, {2, 3}); Shape shape2 = ShapeUtil::MakeShape(xla::F32, {3, 3}); From acaaab2504a94711a4c1084328c79c10b7c9a594 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Mon, 18 May 2020 17:09:11 -0700 Subject: [PATCH 386/412] Rename TransformTensorV2 op to TransformTensorBilinearV2 op. PiperOrigin-RevId: 312184091 Change-Id: I5450142e1022f72705bc5fbdf6c99c94cdbb346b --- tensorflow/lite/delegates/gpu/common/model_builder.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/common/model_builder.cc b/tensorflow/lite/delegates/gpu/common/model_builder.cc index 46856a70a7c..964c8289f83 100644 --- a/tensorflow/lite/delegates/gpu/common/model_builder.cc +++ b/tensorflow/lite/delegates/gpu/common/model_builder.cc @@ -2350,7 +2350,7 @@ class TransformTensorOperationParser : public TFLiteOperationParser { private: }; -class TransformTensorV2OperationParser : public TFLiteOperationParser { +class TransformTensorBilinearV2OperationParser : public TFLiteOperationParser { public: absl::Status IsSupported(const TfLiteContext* context, const TfLiteNode* tflite_node, @@ -2368,7 +2368,7 @@ class TransformTensorV2OperationParser : public TFLiteOperationParser { RETURN_IF_ERROR(reader->AddInput(node, 1)); // bbox RETURN_IF_ERROR(reader->AddOutputs(node)); - std::string op_name = "transform_tensor_v2"; + std::string op_name = "transform_tensor_bilinear_v2"; node->operation.type = op_name; BHWC output_shape; RETURN_IF_ERROR( @@ -2731,8 +2731,8 @@ std::unique_ptr NewOperationParser( if (custom_name == "TransformTensor") { return std::make_unique(); } - if (custom_name == "TransformTensorV2") { - return std::make_unique(); + if (custom_name == "TransformTensorBilinearV2") { + return std::make_unique(); } if (custom_name == "TransformLandmarks") { return std::make_unique(); From 637c14abf840d83e0f6177694030455d6af35937 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 18 May 2020 17:25:05 -0700 Subject: [PATCH 387/412] Add SparseCrossV2 which supports strong_hash with salt, and fingerprint doens't take `hash_key`. hash function will be run before FingerprintCat. 
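For example (illustrative only; the "_X_" separator shown is the one used by
the existing SparseCross string crossing, which this change does not alter):

  input 1 (batch of 1): ['a', 'b']
  input 2 (batch of 1): ['c']
  crossed output:       ['a_X_c', 'b_X_c']

SparseCrossHashed produces hashed values instead of strings: with
num_buckets > 0 the output is hashed_value % num_buckets, and when strong_hash
is set a salted hash is used for the crossed values.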
PiperOrigin-RevId: 312186543 Change-Id: I67a51645250b9d0714b757c85dabf1137e64b167 --- .../base_api/api_def_SparseCrossHashed.pbtxt | 104 +++ .../base_api/api_def_SparseCrossV2.pbtxt | 91 ++ .../api_def_SparseCrossHashed.pbtxt | 4 + .../python_api/api_def_SparseCrossV2.pbtxt | 4 + tensorflow/core/kernels/sparse_cross_op.cc | 805 ++++++++++++------ tensorflow/core/ops/sparse_ops.cc | 40 + .../kernel_tests/sparse_cross_op_test.py | 592 +++++++++++++ .../api/golden/v1/tensorflow.raw_ops.pbtxt | 8 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 8 + 9 files changed, 1417 insertions(+), 239 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_SparseCrossV2.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_SparseCrossHashed.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_SparseCrossV2.pbtxt diff --git a/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt new file mode 100644 index 00000000000..2c4340cb9b7 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_SparseCrossHashed.pbtxt @@ -0,0 +1,104 @@ +op { + graph_op_name: "SparseCrossHashed" + in_arg { + name: "indices" + description: < 0 else hashed_value. +END + } + in_arg { + name: "strong_hash" + description: <