diff --git a/tensorflow/compiler/jit/xla_kernel_creator_util.cc b/tensorflow/compiler/jit/xla_kernel_creator_util.cc index 99046c0bd76..3cc68f2a1a4 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator_util.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator_util.cc @@ -91,7 +91,7 @@ Status CreateXlaKernel(FunctionLibraryRuntime* flr, const NodeDef& node_def, } string message = absl::StrCat( "Function invoked by the following node is not compilable: ", - SummarizeNodeDef(node_def), ".\n"); + SummarizeNodeDef(node_def, /*max_inputs_in_summary=*/10), ".\n"); absl::StrAppend(&message, "Uncompilable nodes:"); for (const auto& node_info : uncompilable_node_info) { string node_message = diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index e0ec990462b..8c24f182f5c 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -201,9 +201,7 @@ void XlaComputationLaunchContext::PopulateInputs( se::Stream* stream = ctx->op_device_context() ? ctx->op_device_context()->stream() : nullptr; // Build ShapedBuffers that point directly to the Tensor buffers. - arg_buffers_.reserve(kernel->xla_input_shapes.size() + 1); - arg_buffers_.resize(kernel->xla_input_shapes.size()); - arg_ptrs_ = std::vector(arg_buffers_.size()); + arg_ptrs_ = std::vector(kernel->xla_input_shapes.size()); // Pass remaining parameters. const Tensor* t; @@ -239,11 +237,11 @@ void XlaComputationLaunchContext::PopulateInputs( << " not the same as on-host shape " << xla::ShapeUtil::HumanStringWithLayout(shape); se::DeviceMemoryBase dmem = XlaTensor::DeviceMemoryFromTensor(*t); - arg_buffers_[i] = absl::make_unique( + arg_buffers_.emplace_back( /*on_host_shape=*/shape, /*on_device_shape=*/shape, client_->platform(), client_->default_device_ordinal()); - arg_buffers_[i]->set_buffer(dmem, /*index=*/{}); - arg_ptrs_[i] = arg_buffers_[i].get(); + arg_buffers_.back().set_buffer(dmem, /*index=*/{}); + arg_ptrs_[i] = &arg_buffers_.back(); } } } diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 511e0f1451a..cf68dcb7dd6 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -165,7 +165,7 @@ class XlaComputationLaunchContext { se::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; bool use_multiple_streams_; - std::vector> arg_buffers_; + std::deque arg_buffers_; std::vector arg_ptrs_; }; diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 6eff7dbd084..1a508bdb190 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -279,22 +279,6 @@ cc_library( ], ) -tf_cc_test( - name = "tftext_utils_test", - size = "small", - srcs = ["utils/lstm_utils_test.cc"], - deps = [ - ":lstm_utils", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - "@llvm-project//llvm:support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:StandardOps", - "@llvm-project//mlir:Support", - ], -) - cc_library( name = "stateful_ops_utils", srcs = [ diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 923efdbaf9d..edb533c9442 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -2297,26 +2297,17 @@ def TFL_PReluOp : TFL_Op<"prelu", [ NoSideEffect, ResultsBroadcastableShape, TFL_GpuTargetOp, - TFL_OperandHasRankAtMost<0, 
4>, - TFL_OperandHasRankAtMost<1, 4>, + TFL_BinaryOperandsHaveSameShapesOrBroadcastableShape<0, 1, 4>, BinaryOpSameElementTypeConstraint, PredOpTrait<"input and output must have the same element type", - TFL_TCresVTEtIsSameAsOp<0, 0>>, - PredOpTrait<"'alpha' should have one less rank than 'input'.", - Or<[TFL_OperandIsUnrankedPred<0>, - TFL_OperandIsUnrankedPred<1>, - CPred<"$_op.getOperand(0).getType().cast().getRank() == " - "$_op.getOperand(1).getType().cast().getRank() " - "+ 1">]>>]> { + TFL_TCresVTEtIsSameAsOp<0, 0>>]> { let summary = "Parameterized Relu operator"; let description = [{ Parameterized Relu operator x -> x >= 0 ? x : (alpha * x) where alpha is a trainable tensor. - alpha should have one less rank than the input as it doesn't have the batch - dimension, and the other dimensions either should be the same size as input - or size 1, where it is broadcasted in the second case. + input and alpha should be the same size as input or be broadcastable. }]; let arguments = ( diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index a07b7b8dd1d..8a2faebcbe6 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -55,8 +55,8 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, std::vector node_names; std::vector node_dtypes; std::vector> node_shapes; - std::vector node_mins; - std::vector node_maxs; + std::vector> node_mins; + std::vector> node_maxs; // Populate quantization specs. TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 51fcbb97360..ab80746f8b7 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -125,8 +125,8 @@ Status ConvertSavedModelToTFLiteFlatBuffer( std::vector node_names; std::vector node_dtypes; std::vector> node_shapes; - std::vector node_mins; - std::vector node_maxs; + std::vector> node_mins; + std::vector> node_maxs; // Populate quantization specs. 
TF_RETURN_IF_ERROR(internal::PopulateQuantizationSpecs( diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index a1401323e89..8f2c8bc362c 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -177,14 +177,13 @@ Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags) { return RegisterCustomBuiltinOps(extra_tf_opdefs); } -Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - mlir::TFL::QuantizationSpecs* quant_specs, - std::vector* node_names, - std::vector* node_dtypes, - std::vector>* node_shapes, - std::vector* node_mins, - std::vector* node_maxs) { +Status PopulateQuantizationSpecs( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector>* node_mins, + std::vector>* node_maxs) { quant_specs->inference_input_type = ConvertIODataTypeToDataType(toco_flags.inference_input_type()); tensorflow::DataType inference_type = @@ -211,11 +210,16 @@ Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, flag.shape().dims().end())); // Currently, only UINT8 and INT8 require inputs stats if (inference_type == DT_QINT8 || inference_type == DT_QUINT8) { - TF_ASSIGN_OR_RETURN( - auto min_max, InputStatsToMinMax(flag.mean_value(), flag.std_value(), - inference_type)); - node_mins->push_back(min_max.first); - node_maxs->push_back(min_max.second); + if (flag.has_mean_value() && flag.has_std_value()) { + TF_ASSIGN_OR_RETURN( + auto min_max, InputStatsToMinMax(flag.mean_value(), + flag.std_value(), inference_type)); + node_mins->push_back(min_max.first); + node_maxs->push_back(min_max.second); + } else { + node_mins->push_back(llvm::None); + node_maxs->push_back(llvm::None); + } } } diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index 3ea36e5eb1d..87e73912a46 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -34,14 +34,13 @@ Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags); // Populate quantization specs (or not) given user specified ranges for each // input arrays. -Status PopulateQuantizationSpecs(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - mlir::TFL::QuantizationSpecs* quant_specs, - std::vector* node_names, - std::vector* node_dtypes, - std::vector>* node_shapes, - std::vector* node_mins, - std::vector* node_maxs); +Status PopulateQuantizationSpecs( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + mlir::TFL::QuantizationSpecs* quant_specs, std::vector* node_names, + std::vector* node_dtypes, + std::vector>* node_shapes, + std::vector>* node_mins, + std::vector>* node_maxs); // Convert imported MLIR file to TfLite flatbuffer. // This will also run relevant passes as well. 
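Note on the xla_launch_util change near the top of this patch (arg_buffers_ becoming std::deque<ShapedBuffer> instead of std::vector<std::unique_ptr<ShapedBuffer>>): unlike std::vector, std::deque never relocates existing elements when growing at the back, so the raw pointers kept in arg_ptrs_ stay valid without the extra unique_ptr indirection. A minimal standalone sketch of the guarantee this relies on (standard C++ only, not TensorFlow code):

#include <cassert>
#include <deque>
#include <vector>

int main() {
  std::deque<int> stable;
  std::vector<int*> ptrs;
  for (int i = 0; i < 1000; ++i) {
    stable.emplace_back(i);
    // Insertion at either end of a std::deque invalidates iterators but not
    // references or pointers to existing elements, so storing raw pointers
    // alongside the container is safe.
    ptrs.push_back(&stable.back());
  }
  for (int i = 0; i < 1000; ++i) assert(*ptrs[i] == i);
  // With std::vector this pattern would become invalid at the first
  // reallocation, which is why the old code held unique_ptr elements.
  return 0;
}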
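Note on the PopulateQuantizationSpecs signature change above: node_mins/node_maxs now carry llvm::Optional<double>, so inputs without mean/std statistics are recorded as llvm::None instead of being dropped or given a sentinel, keeping the vectors positionally aligned with node_names. A rough sketch of that convention (the InputStats struct and CollectRanges helper are illustrative only, not part of the patch):

#include <vector>
#include "llvm/ADT/Optional.h"

// Hypothetical per-input statistics; has_min_max mirrors
// flag.has_mean_value() && flag.has_std_value() in the patch.
struct InputStats {
  bool has_min_max;
  double min;
  double max;
};

void CollectRanges(const std::vector<InputStats>& inputs,
                   std::vector<llvm::Optional<double>>* node_mins,
                   std::vector<llvm::Optional<double>>* node_maxs) {
  for (const InputStats& s : inputs) {
    if (s.has_min_max) {
      node_mins->push_back(s.min);
      node_maxs->push_back(s.max);
    } else {
      // Record "unspecified" explicitly so downstream consumers (e.g. the
      // prepare_quantize.cc change further down) can skip these arguments.
      node_mins->push_back(llvm::None);
      node_maxs->push_back(llvm::None);
    }
  }
}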
diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc index 6b897bd5608..3edd9c36760 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.cc @@ -45,7 +45,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, absl::string_view inference_type, QuantizationSpecs* quant_specs) { std::vector input_nodes = absl::StrSplit(node_names, ','); - std::vector node_mins; + std::vector> node_mins; if (!min_values.empty()) { std::vector node_mins_str = absl::StrSplit(min_values, ','); for (int i = 0; i < node_mins_str.size(); i++) { @@ -57,7 +57,7 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, } } - std::vector node_maxs; + std::vector> node_maxs; if (!max_values.empty()) { std::vector node_maxs_str = absl::StrSplit(max_values, ','); for (int i = 0; i < node_maxs_str.size(); i++) { @@ -79,11 +79,11 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, quant_specs); } -bool GetInputNodeQuantSpecs(const std::vector& node_names, - const std::vector& node_mins, - const std::vector& node_maxs, - tensorflow::DataType inference_type, - QuantizationSpecs* quant_specs) { +bool GetInputNodeQuantSpecs( + const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, QuantizationSpecs* quant_specs) { quant_specs->inference_type = inference_type; // If min/max are not specified, just return; diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h index 2ffba579548..a4046553d17 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h @@ -19,6 +19,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_QUANTIZATION_CONFIG_H_ +#include #include #include @@ -69,7 +70,8 @@ struct QuantizationSpecs { // arguments. They are only used when `weight_quantization` is set to false, // and the model is required to have quantization parameters, either from // quantization aware training or calibration, for the remaining tensors. - std::vector> input_ranges; + std::vector, llvm::Optional>> + input_ranges; // The default ranges can be used when a tensor doesn't have quantization // parameters and couldn't be quantized. Used only for latency tests. @@ -130,11 +132,11 @@ bool ParseInputNodeQuantSpecs(absl::string_view node_names, // Gets the quantization specification for input arrays. The array names are not // stored in the spec, and will be matched by position. The min/max will be // ignored if the inference_type isn't a quantized type. Returns true if failed. 
-bool GetInputNodeQuantSpecs(const std::vector& node_names, - const std::vector& node_mins, - const std::vector& node_maxs, - tensorflow::DataType inference_type, - QuantizationSpecs* quant_specs); +bool GetInputNodeQuantSpecs( + const std::vector& node_names, + const std::vector>& node_mins, + const std::vector>& node_maxs, + tensorflow::DataType inference_type, QuantizationSpecs* quant_specs); } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 87cae3dd957..702808ac892 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -109,8 +109,8 @@ class PrepareQuantizePass // Get the min and max values from the quantization specification for the // current function function and argument index. Uses default values if // the function is specified in the `quantize_whitelist`. - std::pair GetMinMaxValuesForArgument( - llvm::StringRef func_name, int index) { + std::pair, llvm::Optional> + GetMinMaxValuesForArgument(llvm::StringRef func_name, int index) { if (func_name == quant_specs_.target_func) { return quant_specs_.input_ranges[index]; } else { @@ -160,10 +160,14 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(FuncOp func) { } auto min_max = GetMinMaxValuesForArgument(func_name, i); + // The input min/max or mean/std are not specified, then skip. + if (!min_max.first.hasValue() || !min_max.second.hasValue()) return; + TypeAttr params = quant::GetQuantizedTypeAttr( - builder, input_type, builder.getF64FloatAttr(min_max.first), - builder.getF64FloatAttr(min_max.second), /*quant_dim=*/-1, num_bits, - narrow_range, is_signed); + builder, input_type, + builder.getF64FloatAttr(min_max.first.getValue()), + builder.getF64FloatAttr(min_max.second.getValue()), + /*quant_dim=*/-1, num_bits, narrow_range, is_signed); builder.setInsertionPoint(block, insertion_point); auto q_op = builder.create(loc, params.getValue(), arg); diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc index 5df57de6f71..081ba7ac6e7 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils_test.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/test.h" namespace mlir { @@ -92,7 +93,9 @@ class LstmUtilsTest : public ::testing::Test { LstmUtilsTest() {} void SetUp() override { - builder_ = std::unique_ptr(new Builder(&context_)); + RegisterDialects(); + context_ = std::make_unique(); + builder_ = std::unique_ptr(new Builder(context_.get())); fused_lstm_func_ = createLstmCompositeFunc(builder_.get(), false, false); fused_lstm_func_cifg_ = createLstmCompositeFunc(builder_.get(), false, true); @@ -105,10 +108,17 @@ class LstmUtilsTest : public ::testing::Test { fused_ln_lstm_func_.erase(); builder_.reset(); } + + void RegisterDialects() { + mlir::registerDialect(); + mlir::registerDialect(); + mlir::registerDialect(); + } + FuncOp fused_lstm_func_; FuncOp fused_lstm_func_cifg_; FuncOp fused_ln_lstm_func_; - mlir::MLIRContext context_; + std::unique_ptr context_; std::unique_ptr builder_; }; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 1df8f7fd519..9f407ea774a 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -1318,6 +1318,126 @@ greater than `clip_value_max` are set to `clip_value_max`. TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; } +def TF_CollectiveBcastRecvOp : TF_Op<"CollectiveBcastRecv", []> { + let summary = "Receives a tensor value broadcast from another device."; + + let description = [{ + }]; + + let arguments = (ins + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + ); + + TF_DerivedResultTypeAttr T = TF_DerivedResultTypeAttr<0>; +} + +def TF_CollectiveBcastSendOp : TF_Op<"CollectiveBcastSend", []> { + let summary = "Broadcasts a tensor value to one or more other devices."; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I1, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I1, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectiveGatherOp : TF_Op<"CollectiveGather", []> { + let summary = [{ +Mutually accumulates multiple tensors of identical type and shape. + }]; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_ShapeAttr:$shape, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectivePermuteOp : TF_Op<"CollectivePermute", [NoSideEffect, SameOperandsAndResultShape]> { + let summary = "An Op to permute tensors across replicated TPU instances."; + + let description = [{ +Each instance supplies its own input. + +For example, suppose there are 4 TPU instances: `[A, B, C, D]`. Passing +source_target_pairs=`[[0,1],[1,2],[2,3],[3,0]]` gets the outputs: +`[D, A, B, C]`. 
+ }]; + + let arguments = (ins + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$input, + I32Tensor:$source_target_pairs + ); + + let results = (outs + TensorOf<[BF16, F16, F32, F64, I16, I32, I64, I8, TF_Complex128, TF_Complex64, TF_Qint32, TF_Qint8, TF_Quint8, TF_Uint16, TF_Uint32, TF_Uint64, TF_Uint8]>:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + +def TF_CollectiveReduceOp : TF_Op<"CollectiveReduce", [SameOperandsAndResultType]> { + let summary = [{ +Mutually reduces multiple tensors of identical type and shape. + }]; + + let description = [{ + }]; + + let arguments = (ins + TensorOf<[F16, F32, F64, I32, I64]>:$input, + + I64Attr:$group_size, + I64Attr:$group_key, + I64Attr:$instance_key, + TF_AnyStrAttrOf<["Min", "Max", "Mul", "Add"]>:$merge_op, + TF_AnyStrAttrOf<["Id", "Div"]>:$final_op, + I64ArrayAttr:$subdiv_offsets, + DefaultValuedAttr:$wait_for, + DefaultValuedAttr:$communication_hint + ); + + let results = (outs + TensorOf<[F16, F32, F64, I32, I64]>:$data + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_ComplexOp : TF_Op<"Complex", [NoSideEffect, ResultsBroadcastableShape]> { let summary = "Converts two real numbers to a complex number."; diff --git a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir index ad007d0eb50..d6c164f8160 100644 --- a/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir +++ b/tensorflow/compiler/mlir/xla/tests/buffer-assignment.mlir @@ -203,12 +203,12 @@ func @moving_alloc_and_inserting_missing_dealloc(%cond : i1, %arg0 : memref<2xf3 "buffer_assignment_test.unary_lowered"(%arg0, %1) : (memref<2xf32>, memref<2xf32>) -> () br ^exit(%1 : memref<2xf32>) ^exit(%arg2: memref<2xf32>): - "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } // CHECK-NEXT: %[[FIRST_ALLOC:.*]] = alloc() // CHECK-NEXT: %[[SECOND_ALLOC:.*]] = alloc() -// CHECK: "bufer_assignment_test.copy" +// CHECK: "buffer_assignment_test.copy" // CHECK-NEXT: dealloc // CHECK-NEXT: dealloc // CHECK-NEXT: return @@ -226,11 +226,11 @@ func @moving_invalid_dealloc_op_complex(%cond : i1, %arg0 : memref<2xf32>, %arg1 dealloc %1 : memref<2xf32> br ^exit(%1 : memref<2xf32>) ^exit(%arg2: memref<2xf32>): - "bufer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%arg2, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } // CHECK-NEXT: %[[ALLOC:.*]] = alloc() -// CHECK: bufer_assignment_test.copy +// CHECK: buffer_assignment_test.copy // CHECK-NEXT: dealloc // CHECK-NEXT: return @@ -240,10 +240,10 @@ func @moving_invalid_dealloc_op_complex(%cond : i1, %arg0 : memref<2xf32>, %arg1 func @inserting_missing_dealloc_simple(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ %0 = alloc() : memref<2xf32> "buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () - "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } -// CHECK: bufer_assignment_test.copy +// CHECK: buffer_assignment_test.copy // CHECK-NEXT: dealloc // ----- @@ -253,8 +253,8 @@ func @moving_invalid_dealloc_op(%arg0 : memref<2xf32>, %arg1: memref<2xf32>){ %0 = alloc() : memref<2xf32> 
"buffer_assignment_test.unary_lowered"(%arg0, %0) : (memref<2xf32>, memref<2xf32>) -> () dealloc %0 : memref<2xf32> - "bufer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () + "buffer_assignment_test.copy"(%0, %arg1) : (memref<2xf32>, memref<2xf32>) -> () return } -// CHECK: bufer_assignment_test.copy -// CHECK-NEXT: dealloc \ No newline at end of file +// CHECK: buffer_assignment_test.copy +// CHECK-NEXT: dealloc diff --git a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc index 5a0d791079c..40c115f4cbc 100644 --- a/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc +++ b/tensorflow/compiler/mlir/xla/transforms/buffer_assignment_test.cc @@ -29,60 +29,66 @@ limitations under the License. namespace mlir { namespace xla { namespace { + +/// This dialect independent unary operation has been defined only for testing +/// buffer assignment. +class BufferAssignmentTestUnaryOp + : public Op { + public: + using Op::Op; + static StringRef getOperationName() { return "buffer_assignment_test.unary"; } + static void build(OpBuilder& b, OperationState& state, Value source) { + state.addOperands(source); + } +}; + +/// This dialect independent lowered unary operation has been defined only for +/// testing buffer assignment. +class BufferAssignmentTestUnaryLoweredOp + : public Op::Impl> { + public: + using Op::Op; + static StringRef getOperationName() { + return "buffer_assignment_test.unary_lowered"; + } + static void build(OpBuilder& b, OperationState& state, Value source, + Value target) { + state.addOperands(source); + state.addOperands(target); + } +}; + +/// This dialect independent copy operation has been defined only for testing +/// NonVoidToVoidReturnOpConverter +class BufferAssignmentTestCopyOp + : public Op::Impl> { + public: + using Op::Op; + static StringRef getOperationName() { return "buffer_assignment_test.copy"; } + static void build(OpBuilder& b, OperationState& state, Value from, Value to) { + state.addOperands(from); + state.addOperands(to); + } +}; + +class BufferAssignmentTestDialect : public Dialect { + public: + explicit BufferAssignmentTestDialect(MLIRContext* context) + : Dialect(getDialectNamespace(), context) { + addOperations(); + } + static StringRef getDialectNamespace() { return "buffer_assignment_test"; } +}; + /// This pass tests two provided operation converters, /// FunctionAndBlockSignatureConverter and NonVoidToVoidReturnOpConverter, for /// Buffer Assignment. struct BufferAssignmentPreparationTestPass : mlir::PassWrapper { - /// This dialect independent unary operation has been defined only for testing - /// buffer assignment. - class BufferAssignmentTestUnaryOp - : public Op { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.unary"; - } - static void build(OpBuilder& b, OperationState& state, Value source) { - state.addOperands(source); - } - }; - - /// This dialect independent lowered unary operation has been defined only for - /// testing buffer assignment. 
- class BufferAssignmentTestUnaryLoweredOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.unary_lowered"; - } - static void build(OpBuilder& b, OperationState& state, Value source, - Value target) { - state.addOperands(source); - state.addOperands(target); - } - }; - - /// This dialect independent copy operation has been defined only for testing - /// NonVoidToVoidReturnOpConverter - class BufferAssignmentTestCopyOp - : public Op::Impl> { - public: - using Op::Op; - static StringRef getOperationName() { - return "buffer_assignment_test.copy"; - } - static void build(OpBuilder& b, OperationState& state, Value from, - Value to) { - state.addOperands(from); - state.addOperands(to); - } - }; - /// A simple converter that legalizes a BufferAssignmentTestUnaryOp to a /// BufferAssignmentTestUnaryLoweredOp and creates buffer allocation for /// the result of the computation. @@ -151,8 +157,12 @@ struct BufferAssignmentPreparationTestPass } }; }; + } // namespace +static mlir::DialectRegistration + buffer_assignment_test_ops; + /// This pass tests helper methods such as computeAllocPosition, /// FunctionAndBlockSignatureConverter, NonVoidToVoidReturnOpConverter /// conversion patterns. Furthermore, it checks buffer-assignment pass that diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index c7be2c55de7..422695c374b 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -19,7 +19,6 @@ from __future__ import division from __future__ import print_function import itertools -import os import numpy as np @@ -1609,8 +1608,4 @@ class BinaryOpsTest(xla_test.XLATestCase): if __name__ == "__main__": - # TODO(b/130689556): XLA CPU does not honor inf/nan which causes problems - os.environ[ - "XLA_FLAGS"] = "--xla_cpu_enable_fast_math=false " + os.environ.get( - "XLA_FLAGS", "") googletest.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index d0e928a5ce6..85bf89c4f9e 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -347,17 +347,15 @@ class UnaryOpsTest(xla_test.XLATestCase): expected=np.array( [1.55740772, -2.18503986, -0.14254654, 1.15782128], dtype=dtype)) - # TODO(b/130689556): Turn this on for CPU when we start honoring NaNs. 
- if self.device != "XLA_CPU": - self._assertOpOutputMatchesExpected( - math_ops.tanh, - np.array([[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], - [19, -19, 22, -22]], - dtype=dtype), - expected=np.array( - [[0.76159418, 0.96402758, 0.99505478, 0.99932933], - [1.0, -1.0, np.nan, 1.0], [1.0, -1.0, 1.0, -1.0]], - dtype=dtype)) + self._assertOpOutputMatchesExpected( + math_ops.tanh, + np.array([[1, 2, 3, 4], [np.inf, -np.inf, np.nan, 20], + [19, -19, 22, -22]], + dtype=dtype), + expected=np.array( + [[0.76159418, 0.96402758, 0.99505478, 0.99932933], + [1.0, -1.0, np.nan, 1.0], [1.0, -1.0, 1.0, -1.0]], + dtype=dtype)) self._assertOpOutputMatchesExpected( nn_ops.log_softmax, diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index d01f094dc2e..976ff91f6ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -136,8 +136,11 @@ class TensorListReserveOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_elements)); OP_REQUIRES( ctx, num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the number of elements.")); + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set the number " + "of elements. This could also happen if you're using a TensorArray " + "in a while loop that does not have its maximum_iteration set, you " + "can fix this by setting maximum_iteration to a suitable value.")); // If element shape is compile time constant and it's not "unknown rank" // shape (-1), create an initialized TensorList. Otherwise create an @@ -197,10 +200,13 @@ class EmptyTensorListOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { int64 max_num_elements; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &max_num_elements)); - OP_REQUIRES( - ctx, max_num_elements >= 0, - errors::InvalidArgument("XLA compilation requires a fixed tensor list " - "size. Set the max number of elements.")); + OP_REQUIRES(ctx, max_num_elements >= 0, + errors::InvalidArgument( + "XLA compilation requires a fixed tensor list size. Set " + "the max number of elements. This could also happen if " + "you're using a TensorArray in a while loop that does not " + "have its maximum_iteration set, you can fix this by " + "setting maximum_iteration to a suitable value.")); if (dtype_ != DT_VARIANT) { // We are creating a non-nested TensorList. 
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index f1ac1fef451..5d7bd26b01e 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -63,10 +63,6 @@ class ExecutionInput { explicit ExecutionInput(xla::Shape shape) : buffers_(std::move(shape)) {} explicit ExecutionInput(ShapeTree buffers) : buffers_(std::move(buffers)) {} - ExecutionInput(ShapeTree buffers, - std::vector owner_held_indices) - : buffers_(std::move(buffers)), - unowned_indices_(std::move(owner_held_indices)) {} ExecutionInput(ExecutionInput&&) = default; ~ExecutionInput() { diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index 3c4e9f7c1e6..a3056b1ddad 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -40,7 +40,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { public: // A NestedComputer computes an element of the output of the given computation // given a Span of its input elements. - using NestedComputer = std::function( + using NestedComputer = std::function>( const HloComputation&, absl::Span)>; GpuElementalIrEmitter(const HloModuleConfig& hlo_module_config, @@ -91,12 +91,7 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr> EmitThreadLocalCall( const HloComputation& callee, absl::Span parameters, absl::string_view) override { - // TODO(b/118332391): Supported variadic return values. - auto result = compute_nested_(callee, parameters); - if (!result.ok()) { - return result.status(); - } - return std::vector{result.ValueOrDie()}; + return compute_nested_(callee, parameters); } llvm::Value* EmitThreadId() override; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 744cd7b56bf..aa8a6215cc7 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -698,115 +698,6 @@ Status IrEmitter::HandleParameter(HloInstruction* parameter) { return Status::OK(); } -Status IrEmitter::HandleReduce(HloInstruction* instr) { - const HloReduceInstruction* reduce = Cast(instr); - const Shape& out_shape = reduce->shape(); - bool returns_tuple = !out_shape.IsArray(); - int accumulators_count = 1; - if (returns_tuple) { - CHECK(out_shape.IsTuple()); - accumulators_count = out_shape.tuple_shapes_size(); - } - - auto arg = reduce->operand(0); - absl::Span dimensions(reduce->dimensions()); - HloComputation* function = reduce->to_apply(); - return EmitTargetElementLoop( - *reduce, - [=](const llvm_ir::IrArray::Index& index) -> StatusOr { - std::vector accumulator_addrs; - std::vector accumulator_types; - - // Initialize accumulators with initial values. - for (int i = 0; i < accumulators_count; i++) { - auto init_value = reduce->init_values()[i]; - const Shape& element_shape = - returns_tuple ? out_shape.tuple_shapes(i) : out_shape; - PrimitiveType accumulator_type = element_shape.element_type(); - llvm::Type* accumulator_llvm_type = - llvm_ir::PrimitiveTypeToIrType(accumulator_type, module_); - llvm::AllocaInst* accumulator_addr = Alloca(accumulator_llvm_type); - Store(Load(GetBasePointer(*init_value)), accumulator_addr); - accumulator_addrs.push_back(accumulator_addr); - accumulator_types.push_back(accumulator_llvm_type); - } - - // The enclosing loops go over all the target elements. 
Now we have to - // compute the actual target element. For this, we build a new loop nest - // to iterate over all the reduction dimensions in the argument. - // AddLoopsForShapeOnDimensions will return an Index where induction - // Value*s are placed for each dimension in dimensions, and all the rest - // are nullptrs. - llvm_ir::ForLoopNest loops(IrName(reduce, "inner"), &b_); - std::vector input_multi_index = - loops.AddLoopsForShapeOnDimensions(arg->shape(), dimensions, - "reduction_dim"); - - SetToFirstInsertPoint(loops.GetInnerLoopBodyBasicBlock(), &b_); - - // Build a full index for the input argument, using reduced_dims_index - // as the base. In reduced_dims_index only the reduction dimensions are - // filled in. We fill in the rest of the dimensions with induction - // Value*s taken from 'index' which iterates over the target array. - // See the high-level description in the XLA documentation for details. - llvm_ir::IrArray::Index::const_iterator it = index.begin(); - - for (auto& i : input_multi_index) { - if (i == nullptr) { - i = *it++; - } - } - CHECK(index.end() == it); - - // Apply the reduction function to the loaded value. - llvm_ir::IrArray::Index input_index(input_multi_index, arg->shape(), - b_.getInt64Ty()); - std::vector reduction_operands(accumulator_addrs.begin(), - accumulator_addrs.end()); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* input_address = - GetIrArray(*reduce->operand(i), *reduce) - .EmitArrayElementAddress(input_index, &b_); - reduction_operands.push_back(input_address); - } - - llvm::Value* ret_argument; - if (!returns_tuple) { - CHECK_EQ(accumulator_addrs.size(), 1); - ret_argument = accumulator_addrs[0]; - } else { - const Shape& return_shape = function->root_instruction()->shape(); - - llvm::Type* return_value_buffer_type = - llvm_ir::ShapeToIrType(return_shape, module_); - ret_argument = Alloca(return_value_buffer_type); - llvm_ir::IrArray tuple_array(ret_argument, return_shape); - EmitTuple(tuple_array, accumulator_addrs, &b_); - } - - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *function, reduction_operands, ret_argument)); - - SetToFirstInsertPoint(loops.GetOuterLoopExitBasicBlock(), &b_); - - if (!returns_tuple) { - CHECK_EQ(accumulator_addrs.size(), 1); - return Load(accumulator_addrs[0]); - } else { - // Emit a struct for the LoopEmitter dealing with multi-output - // fusion. - llvm::Value* returned_structure = llvm::UndefValue::get( - llvm::StructType::get(b_.getContext(), accumulator_types)); - for (int i = 0; i < accumulators_count; i++) { - llvm::Value* accumulator_value = Load(accumulator_addrs[i]); - returned_structure = - b_.CreateInsertValue(returned_structure, accumulator_value, i); - } - return returned_structure; - } - }); -} - Status IrEmitter::HandleFusion(HloInstruction* fusion) { // kFusion for library calls should be handled by // IrEmitterUnnested::HandleFusion. 
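Note on the removal above: HandleReduce special-cased reduce in the GPU IrEmitter, and the ComputeNestedElement change in the next hunk lets nested computations return one value per tuple leaf, which is what tuple-shaped ("variadic") reduce needs. For readers unfamiliar with that HLO feature, a plain-C++ sketch of its semantics (illustrative only, not XLA code): one reducer updates several accumulators in lock step, so the result is a tuple of values rather than a single value.

#include <algorithm>
#include <limits>
#include <utility>
#include <vector>

// One pass, two accumulators: the analogue of an HLO reduce whose to_apply
// computation returns a (sum, max) tuple.
std::pair<float, float> SumAndMax(const std::vector<float>& xs) {
  float sum = 0.0f;
  float mx = -std::numeric_limits<float>::infinity();
  for (float x : xs) {
    sum += x;              // accumulator 0
    mx = std::max(mx, x);  // accumulator 1
  }
  return {sum, mx};
}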
@@ -866,22 +757,39 @@ Status IrEmitter::HandleBatchNormGrad(HloInstruction*) { "to a cudnn CustomCall using CudnnBatchNormRewriter."); } -StatusOr IrEmitter::ComputeNestedElement( +StatusOr> IrEmitter::ComputeNestedElement( const HloComputation& computation, absl::Span parameter_elements) { + const Shape& return_shape = computation.root_instruction()->shape(); llvm::Value* return_buffer = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType( - computation.root_instruction()->shape().element_type(), module_), - "return_buffer", &b_); + llvm_ir::ShapeToIrType(return_shape, module_), "return_buffer", &b_); std::vector parameter_buffers; for (llvm::Value* parameter_element : parameter_elements) { parameter_buffers.push_back(llvm_ir::EmitAllocaAtFunctionEntry( parameter_element->getType(), "parameter_buffer", &b_)); Store(parameter_element, parameter_buffers.back()); } + + std::vector allocas_for_returned_scalars; + if (!return_shape.IsTuple()) { + allocas_for_returned_scalars.push_back(return_buffer); + } else { + allocas_for_returned_scalars = + llvm_ir::EmitTupleAllocasAtFunctionEntry(return_shape, &b_); + llvm_ir::IrArray tuple_array(return_buffer, return_shape); + + EmitTuple(tuple_array, allocas_for_returned_scalars, &b_); + } + TF_RETURN_IF_ERROR(EmitCallToNestedComputation(computation, parameter_buffers, return_buffer)); - return Load(return_buffer); + + std::vector returned_scalars; + returned_scalars.reserve(allocas_for_returned_scalars.size()); + for (llvm::Value* addr : allocas_for_returned_scalars) { + returned_scalars.push_back(Load(addr)); + } + return returned_scalars; } std::vector IrEmitter::ConstructIrArrayForOutputs( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index e0fe454dcfe..93712961ea2 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -89,7 +89,6 @@ class IrEmitter : public DfsHloVisitorWithDefault, Status HandleRecv(HloInstruction* recv) override; Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleParameter(HloInstruction* parameter) override; - Status HandleReduce(HloInstruction* reduce) override; Status HandleTuple(HloInstruction* tuple) override; Status HandleScatter(HloInstruction* scatter) override; Status HandleSelect(HloInstruction* select) override; @@ -213,7 +212,7 @@ class IrEmitter : public DfsHloVisitorWithDefault, const llvm_ir::IrArray::Index& compare_keys_index, const llvm_ir::IrArray& keys_array); - StatusOr ComputeNestedElement( + StatusOr> ComputeNestedElement( const HloComputation& computation, absl::Span parameter_elements); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 3930898d665..ad21efa13c9 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -312,12 +312,13 @@ optional MatchTrivialComputation(const HloComputation* computation) { class HloDotDumper { public: HloDotDumper(const HloComputation* computation, absl::string_view label, - const DebugOptions& debug_options, bool show_backend_config, + const DebugOptions& debug_options, + HloRenderOptions hlo_render_options, const HloExecutionProfile* profile, NodeFilter filter) : computation_(computation), label_(label), debug_options_(debug_options), - show_backend_config_(show_backend_config), + hlo_render_options_(hlo_render_options), profile_(profile), 
filter_(std::move(filter)) {} @@ -384,7 +385,7 @@ class HloDotDumper { const HloComputation* computation_; // never null const string label_; // overall name for the graph const DebugOptions& debug_options_; - const bool show_backend_config_; + const HloRenderOptions hlo_render_options_; const HloExecutionProfile* profile_; // may be null const NodeFilter filter_; @@ -565,7 +566,8 @@ bool HloDotDumper::ShouldShowFusionSubcomputation(const HloInstruction* instr) { bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { if (subcomp->IsFusionComputation()) { const HloInstruction* fusion = subcomp->FusionInstruction(); - if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion)) { + if (!filter_.Show(fusion) || filter_.SomeOrAllOperandsOmitted(fusion) || + !hlo_render_options_.show_fusion_subcomputations) { return false; } } @@ -1133,7 +1135,8 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { string HloDotDumper::GetInstructionNodeBackendConfig( const HloInstruction* instr) { - if (!show_backend_config_ || instr->raw_backend_config_string().empty()) { + if (!hlo_render_options_.show_backend_config || + instr->raw_backend_config_string().empty()) { return ""; } @@ -1604,14 +1607,14 @@ StatusOr RenderGraph(const HloComputation& computation, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return Unavailable("Can't render as URL; no URL renderer was registered."); } string rendered_dot = - HloDotDumper(&computation, label, debug_options, show_backend_config, + HloDotDumper(&computation, label, debug_options, hlo_render_options, hlo_execution_profile, NodeFilter()) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1619,7 +1622,7 @@ StatusOr RenderGraph(const HloComputation& computation, StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config, + HloRenderOptions hlo_render_options, const absl::flat_hash_set& boundary) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { @@ -1632,7 +1635,7 @@ StatusOr RenderNeighborhoodAround( string rendered_dot = HloDotDumper(node.parent(), label, node.GetModule()->config().debug_options(), - show_backend_config, /*profile=*/nullptr, + hlo_render_options, /*profile=*/nullptr, MakeNodeRadiusAroundFilter(&node, radius, boundary)) .Dump(); return WrapDotInFormat(rendered_dot, format); @@ -1641,7 +1644,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config) { + HloRenderOptions hlo_render_options) { tensorflow::mutex_lock lock(url_renderer_mu); if (format == RenderedGraphFormat::kUrl && url_renderer == nullptr) { return FailedPrecondition( @@ -1663,7 +1666,7 @@ StatusOr RenderAllPathsFromTo(const HloInstruction& from, "NODES***

"); } string rendered_dot = - HloDotDumper(from.parent(), label, debug_options, show_backend_config, + HloDotDumper(from.parent(), label, debug_options, hlo_render_options, /*profile=*/nullptr, filter) .Dump(); return WrapDotInFormat(rendered_dot, format); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 324ac67a6dd..528de77e4e6 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -50,6 +50,14 @@ enum class RenderedGraphFormat { kUrl, }; +struct HloRenderOptions { + // Include the backend config string in the rendered graph. + bool show_backend_config = false; + + // Include the fusion subcomputations in the rendered graph. + bool show_fusion_subcomputations = true; +}; + // Renders an HLO module as a human-readable visual graph. // // Note that this only works well for relatively small graphs (no more than a @@ -61,7 +69,7 @@ StatusOr RenderGraph( const HloComputation& computation, absl::string_view label, const DebugOptions& debug_options, RenderedGraphFormat format, const HloExecutionProfile* hlo_execution_profile = nullptr, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Like RenderGraph, but renders only nodes "near" the given node in the graph. // @@ -73,7 +81,7 @@ StatusOr RenderGraph( // will be omitted even if they are within the radius. StatusOr RenderNeighborhoodAround( const HloInstruction& node, int radius, RenderedGraphFormat format, - bool show_backend_config = false, + HloRenderOptions hlo_render_options = {}, const absl::flat_hash_set& boundary = {}); // Renders nodes on any of the paths from `from` to `to`. If there are more @@ -82,7 +90,7 @@ StatusOr RenderNeighborhoodAround( StatusOr RenderAllPathsFromTo(const HloInstruction& from, const HloInstruction& to, int64 max_nodes, RenderedGraphFormat format, - bool show_backend_config = false); + HloRenderOptions hlo_render_options = {}); // Registers a function which implements RenderedGraphFormat::kUrl. // diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index b7a67b4e66e..995b0ece7cd 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -137,9 +137,8 @@ class ShapedBuffer { std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer); -// ShapedBuffer derived class which allocates all internal buffers on -// construction and deallocates the memory when the object is -// destructed. +// ScopedShapedBuffer takes allocated buffers as inputs, and deallocates on +// destruction. This class represents an owning wrapper around `ShapedBuffer`. // // TODO(timshen): Remove inheritance between ScopedShapedBuffer and // ShapedBuffer. 
There should never be a need to consider a ScopedShapedBuffer diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc index 8eee452328e..068442ad5c7 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner.cc @@ -81,11 +81,10 @@ void SpmdLogger::RegisterLogEntry(HloInstruction* hlo, string report = hlo->ToString(); int64 max_value = -1; for (HloInstruction* inst : group) { - if (inst->shape().IsTuple()) { + if (!inst->shape().IsArray()) { continue; } - max_value = - std::max(max_value, ShapeUtil::ByteSizeOf(inst->shape(), 4)); + max_value = std::max(max_value, ShapeSizeInBytes(inst->shape())); absl::StrAppend(&report, " * ", inst->ToString(), "\n"); } entries_.push_back(std::make_pair(max_value, report)); @@ -149,14 +148,14 @@ template const auto add_report = [&](std::vector* insts) { std::sort(insts->begin(), insts->end(), [](const HloInstruction* inst0, const HloInstruction* inst1) { - return ShapeUtil::ByteSizeOf(inst0->shape()) > - ShapeUtil::ByteSizeOf(inst1->shape()); + return ShapeSizeInBytes(inst0->shape()) > + ShapeSizeInBytes(inst1->shape()); }); for (int64 i = 0; i < std::min(report_instruction_count, insts->size()); ++i) { absl::StrAppend(&report, " ", tensorflow::strings::HumanReadableNumBytes( - ShapeUtil::ByteSizeOf((*insts)[i]->shape())), + ShapeSizeInBytes((*insts)[i]->shape())), " : ", (*insts)[i]->ToString(), "\n"); } }; @@ -1180,8 +1179,8 @@ Status SpmdPartitioningVisitor::HandleScatter(HloInstruction* hlo) { if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( operand, scatter_dims_to_operand_dims, slice_size, num_partitions_) && - ShapeUtil::ByteSizeOf(updates.base_shape()) < - ShapeUtil::ByteSizeOf(scatter->shape())) { + ShapeSizeInBytes(updates.base_shape()) < + ShapeSizeInBytes(scatter->shape())) { // Operand is sharded on trivial slice dims (update slice size 1). We can // adjust the indices on each partition by subtracting the offsets. Then // we execute a scatter on full updated indices, and out-of-bound accesses @@ -1968,8 +1967,8 @@ Status SpmdPartitioningVisitor::HandleGather(HloInstruction* hlo) { if (GatherScatterOperandPartitionedOnlyOnTrivialSliceDims( operand, start_index_map, gather->gather_slice_sizes(), num_partitions_) && - ShapeUtil::ByteSizeOf(gather->shape()) < - ShapeUtil::ByteSizeOf(gather->operand(0)->shape())) { + ShapeSizeInBytes(gather->shape()) < + ShapeSizeInBytes(gather->operand(0)->shape())) { indices = indices.Reshard(HloSharding::Replicate()); // Now the operand is partitioned in trivial slice dimensions, and the // indices are replicated. 
We execute a gather on partitioned operand, @@ -2762,8 +2761,7 @@ Status SpmdPartitioningVisitor::HandleConvolutionTiledLhsAndRhs( auto zero = b_.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::Zero(hlo->shape().element_type()))); - if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < - ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (ShapeSizeInBytes(lhs.base_shape()) < ShapeSizeInBytes(rhs.base_shape())) { if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { return DefaultAction(hlo); } @@ -3005,8 +3003,8 @@ Status SpmdPartitioningVisitor::HandleConvolution(HloInstruction* hlo) { }; auto zero = b_.AddInstruction(HloInstruction::CreateConstant( LiteralUtil::Zero(hlo->shape().element_type()))); - if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < - ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (ShapeSizeInBytes(lhs.base_shape()) < + ShapeSizeInBytes(rhs.base_shape())) { if (unsupported_sharding(aligned_lhs_sharding, rhs.sharding())) { return DefaultAction(hlo); } @@ -3731,7 +3729,7 @@ Status SpmdPartitioningVisitor::HandleDotHelper( }; if (output_lhs_non_contracting_partitions == num_partitions_ && output_sharding_transposed_to_match_lhs == lhs_sharding && - ShapeUtil::ByteSizeOf(hlo->operand(1)->shape()) >= + ShapeSizeInBytes(hlo->operand(1)->shape()) >= options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (rhs_contracting_partitions == num_partitions_) { return emit_windowed_dot_general(0, 1, true, false); @@ -3745,7 +3743,7 @@ Status SpmdPartitioningVisitor::HandleDotHelper( } if (output_rhs_non_contracting_partitions == num_partitions_ && output_sharding_transposed_to_match_rhs == rhs_sharding && - ShapeUtil::ByteSizeOf(hlo->operand(0)->shape()) >= + ShapeSizeInBytes(hlo->operand(0)->shape()) >= options_.threshold_for_windowed_einsum_mib * 1024 * 1024) { if (lhs_contracting_partitions == num_partitions_) { return emit_windowed_dot_general(1, 0, true, false); @@ -3775,8 +3773,8 @@ Status SpmdPartitioningVisitor::HandleDotHelper( LiteralUtil::Zero(hlo->shape().element_type()))); // Pad both sides with zero, since NaN at one side cannot be masked by zero // on the other side. 
- if (ShapeUtil::ByteSizeOf(lhs.base_shape()) < - ShapeUtil::ByteSizeOf(rhs.base_shape())) { + if (ShapeSizeInBytes(lhs.base_shape()) < + ShapeSizeInBytes(rhs.base_shape())) { lhs = lhs.Reshard(*rhs_sharding_transposed_to_match_lhs).PadWithValue(zero); rhs = rhs.PadWithValue(zero); @@ -4607,8 +4605,8 @@ HloInstruction* SpmdPartitioner::AllGatherShards(SpmdBuilder* b, xpose_permutation[i] = i + tiled_dims.size() - split_dims_added; } else { xpose_permutation[i] = split_dims_added; + xpose_permutation[i + 1] = i + tiled_dims.size() - split_dims_added; split_dims_added++; - xpose_permutation[i + 1] = i + tiled_dims.size(); i++; } } diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc index 55d7dc43785..e766695385b 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_test.cc @@ -649,6 +649,43 @@ ENTRY entry { op::ReduceWindow(masked, op::Constant()))); } +TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideHaloBeyondNeighbor) { + const char* const hlo_string = R"( +HloModule module + +sum { + a = f32[] parameter(0) + b = f32[] parameter(1) + ROOT add = f32[] add(a, b) +} + +ENTRY entry { + param = f32[9,2] parameter(0), sharding={devices=[5,1]0,1,2,3,4} + constant.1 = f32[] constant(0), sharding={replicated} + ROOT reduce-window = f32[5,2]{1,0} reduce-window(param, constant.1), + window={size=4x1 stride=2x1 pad=3_0x0_0}, to_apply=sum, + sharding={devices=[5,1]0,1,2,3,4} +})"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + PartitionComputation(hlo_string, /*num_devices=*/5)); + VLOG(1) << module->ToString(); + auto halo0 = AllOf(op::Shape("f32[1,2]"), + op::CollectivePermute(op::Slice(op::Parameter(0)))); + auto halo1 = + AllOf(op::Shape("f32[2,2]"), op::CollectivePermute(op::Parameter(0))); + auto pre_mask = + AllOf(op::Shape("f32[4,2]"), + op::Slice(AllOf(op::Shape("f32[5,2]"), + op::Concatenate(halo0, halo1, op::Parameter(0))))); + auto masked = + op::Select(op::Compare(op::Add(op::Iota(), op::Broadcast(op::Multiply())), + op::Broadcast(op::Constant())), + pre_mask, op::Broadcast(op::Constant())); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, AllOf(op::Shape("f32[1,2]{1,0}"), + op::ReduceWindow(masked, op::Constant()))); +} + TEST_F(SpmdPartitioningTest, ReduceWindowTiledOneSideUnequalHalo) { const char* const hlo_string = R"( HloModule module diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc index 207f854cd9f..8db2ca84a05 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h" +#include + #include "absl/types/optional.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -23,6 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/service/spmd/spmd_partitioner.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -104,6 +107,11 @@ Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding) { return sharding.TileShape(shape); } +int64 ShapeSizeInBytes(const Shape& shape) { + return ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type()) * + ShapeUtil::ElementsIn(shape); +} + Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, const HloSharding& sharding, int64 partition_id) { @@ -402,33 +410,30 @@ absl::optional ExchangeHalo( std::vector concat_pieces; int64 max_left_halo_size = left_halo_size_function.MaxInRange(1, shard_count); - if (max_left_halo_size > input_shard_size) { - VLOG(1) << "ExchangeHalo failed: halo is beyond the left neighbor."; - return absl::nullopt; - } - if (max_left_halo_size > 0) { + for (int64 i = CeilOfRatio(max_left_halo_size, input_shard_size) - 1; i >= 0; + --i) { std::vector> source_target_pairs; target.tile_assignment().Each( [&](absl::Span indices, int64 device) { - if (indices[dim] > 0) { + if (indices[dim] > i) { std::vector source_indices(indices.begin(), indices.end()); - source_indices[dim] -= 1; + source_indices[dim] -= i + 1; source_target_pairs.emplace_back( target.tile_assignment()(source_indices), device); } }); + int64 halo_size = + std::min(max_left_halo_size - input_shard_size * i, input_shard_size); auto halo_shape = hlo->shape(); auto source_halo_slice = hlo; - if (max_left_halo_size != hlo->shape().dimensions(dim)) { - halo_shape.set_dimensions(dim, max_left_halo_size); + if (halo_size != hlo->shape().dimensions(dim)) { + halo_shape.set_dimensions(dim, halo_size); std::vector halo_start_indices(halo_shape.rank(), 0); - halo_start_indices[dim] = - hlo->shape().dimensions(dim) - max_left_halo_size; + halo_start_indices[dim] = hlo->shape().dimensions(dim) - halo_size; std::vector halo_slice_strides(halo_shape.rank(), 1); - - source_halo_slice = b->AddInstruction( - hlo->CreateSlice(halo_shape, hlo, halo_start_indices, - hlo->shape().dimensions(), halo_slice_strides)); + source_halo_slice = b->AddInstruction(HloInstruction::CreateSlice( + halo_shape, hlo, halo_start_indices, hlo->shape().dimensions(), + halo_slice_strides)); } auto left_halo = collective_ops_creator.create_cross_partition_collective_permute( @@ -441,29 +446,30 @@ absl::optional ExchangeHalo( // Right halo. 
int64 max_right_halo_size = right_halo_size_function.MaxInRange(0, shard_count - 1); - if (max_right_halo_size > input_shard_size) { - VLOG(1) << "ExchangeHalo failed: halo is beyond the right neighbor."; - return absl::nullopt; - } - if (max_right_halo_size > 0) { + for (int64 i = 0; i < CeilOfRatio(max_right_halo_size, input_shard_size); + ++i) { std::vector> source_target_pairs; target.tile_assignment().Each( [&](absl::Span indices, int64 device) { - if (indices[dim] > 0) { + if (indices[dim] > i) { std::vector target_indices(indices.begin(), indices.end()); - target_indices[dim] -= 1; + target_indices[dim] -= i + 1; source_target_pairs.emplace_back( device, target.tile_assignment()(target_indices)); } }); + int64 halo_size = + std::min(max_right_halo_size - input_shard_size * i, input_shard_size); auto halo_shape = hlo->shape(); - halo_shape.set_dimensions(dim, max_right_halo_size); - std::vector halo_start_indices(halo_shape.rank(), 0); - std::vector halo_slice_strides(halo_shape.rank(), 1); - - auto source_halo_slice = b->AddInstruction( - hlo->CreateSlice(halo_shape, hlo, halo_start_indices, - halo_shape.dimensions(), halo_slice_strides)); + HloInstruction* source_halo_slice = hlo; + if (halo_size != halo_shape.dimensions(dim)) { + halo_shape.set_dimensions(dim, halo_size); + std::vector halo_start_indices(halo_shape.rank(), 0); + std::vector halo_slice_strides(halo_shape.rank(), 1); + source_halo_slice = b->AddInstruction(HloInstruction::CreateSlice( + halo_shape, hlo, halo_start_indices, halo_shape.dimensions(), + halo_slice_strides)); + } auto right_halo = collective_ops_creator.create_cross_partition_collective_permute( b, source_halo_slice, source_target_pairs, (*next_channel_id)++); diff --git a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h index f96b23d7073..440f0e78112 100644 --- a/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h +++ b/tensorflow/compiler/xla/service/spmd/spmd_partitioner_util.h @@ -57,6 +57,10 @@ bool EvenlyPartitions(const Shape& shape, const HloSharding& sharding); // target sharding. Shape MakePartitionedShape(const Shape& shape, const HloSharding& sharding); +// Similar to ShapeUtil::ByteSizeOf(), but does not check it has dense layout +// since this can be before layout assignment. +int64 ShapeSizeInBytes(const Shape& shape); + // Returns the shard shape for a partition without padding due to uneven // sharding. Shape MakeNonPaddedShapeForGivenPartition(const Shape& shape, diff --git a/tensorflow/compiler/xla/tools/interactive_graphviz.cc b/tensorflow/compiler/xla/tools/interactive_graphviz.cc index 4f8a6b43314..b6c62beff74 100644 --- a/tensorflow/compiler/xla/tools/interactive_graphviz.cc +++ b/tensorflow/compiler/xla/tools/interactive_graphviz.cc @@ -112,8 +112,7 @@ constexpr int64 kDefaultMaxNumNodesInAllPaths = 100; using absl::EqualsIgnoreCase; -// A global control for whether backend configuration display is enabled. -bool show_backend_config = true; +HloRenderOptions hlo_render_options; HloInstruction* FindInstruction(const HloModule& module, string node_name) { if (absl::StartsWith(node_name, "%")) { @@ -160,6 +159,8 @@ void DoHelpCommand() { Renders all nodes in . backend_config [on|off] Controls whether backend operation configuration information is printed. + show_fusion_subcomputations [on|off] + Controls whether fusion subcomputations are shown. 
list [name|op_name|op_type] Lists all instructions whose name, metadata op_name, or metadata op_type contains as a substring. @@ -182,15 +183,32 @@ void DoHelpCommand() { // Turn metadata-printing on or off. void DoBackendConfigCommand(const std::vector& tokens) { if (tokens.size() == 2 && tokens[1] == "on") { - show_backend_config = true; + hlo_render_options.show_backend_config = true; } else if (tokens.size() == 2 && tokens[1] == "off") { - show_backend_config = false; + hlo_render_options.show_backend_config = false; } else if (tokens.size() != 1) { std::cerr << "(Illegal backend_config value. Use either 'on' or 'off'.)" << std::endl; } std::cout << "Backend configuration display " - << (show_backend_config ? "ON" : "OFF") << std::endl; + << (hlo_render_options.show_backend_config ? "ON" : "OFF") + << std::endl; +} + +// Turn fusion computation display on or off. +void DoShowFusionSubcomputationsCommand(const std::vector& tokens) { + if (tokens.size() == 2 && tokens[1] == "on") { + hlo_render_options.show_fusion_subcomputations = true; + } else if (tokens.size() == 2 && tokens[1] == "off") { + hlo_render_options.show_fusion_subcomputations = false; + } else if (tokens.size() != 1) { + std::cerr << "(Illegal show_fusion_subcomputations value. Use either " + "'on' or 'off'.)" + << std::endl; + } + std::cout << "Fusion subcomputations display " + << (hlo_render_options.show_fusion_subcomputations ? "ON" : "OFF") + << std::endl; } // List all computations in the module. @@ -373,7 +391,7 @@ void DoExtractCommand(const HloModule& module, auto extracted_module = ExtractModule(instr, height); std::cout << extracted_module->ToString( HloPrintOptions::ShortParsable().set_print_backend_config( - show_backend_config)) + hlo_render_options.show_backend_config)) << std::endl; } @@ -517,7 +535,7 @@ void DoAllPathsCommand(const Options& opts, const HloModule& module, } RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderAllPathsFromTo(*from, *to, max_nodes, format, - /*show_backend_config=*/show_backend_config); + hlo_render_options); }); } @@ -582,15 +600,13 @@ void DoPlotCommand(const Options& opts, const HloModule& module, RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { return RenderGraph(*comp, /*label=*/"", comp->parent()->config().debug_options(), format, - /*hlo_execution_profile=*/nullptr, - /*show_backend_config=*/show_backend_config); + /*hlo_execution_profile=*/nullptr, hlo_render_options); }); } else { RenderAndDisplayGraph(opts, [&](RenderedGraphFormat format) { - return RenderNeighborhoodAround( - *instr, graph_width, format, - /*show_backend_config=*/show_backend_config, - /*boundary=*/boundary); + return RenderNeighborhoodAround(*instr, graph_width, format, + hlo_render_options, + /*boundary=*/boundary); }); } } @@ -617,6 +633,8 @@ void InteractiveDumpGraphs(const Options& opts, const HloModule& module) { DoHelpCommand(); } else if (tokens[0] == "backend_config") { DoBackendConfigCommand(tokens); + } else if (tokens[0] == "show_fusion_subcomputations") { + DoShowFusionSubcomputationsCommand(tokens); } else if (tokens[0] == "list") { if (tokens.size() > 1 && tokens[1] == "computations") { DoListComputationsCommand(module, tokens); diff --git a/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt index 09eff6177b1..ae5942b3617 100644 --- a/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt @@ -84,6 +84,13 
@@ END name: "Tout" description: <
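Note on the hlo_graph_dumper / interactive_graphviz changes earlier in this patch: the bool show_backend_config parameter is folded into an HloRenderOptions struct, which also gains show_fusion_subcomputations. A hedged usage sketch based only on the declarations added above; the RenderedGraphFormat::kDot value and the surrounding computation/debug-options objects are assumptions, not shown in this diff:

#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"

// Renders a computation with backend configs shown and fusion bodies hidden.
xla::StatusOr<std::string> RenderWithoutFusionBodies(
    const xla::HloComputation& computation,
    const xla::DebugOptions& debug_options) {
  xla::HloRenderOptions options;
  options.show_backend_config = true;           // replaces the old bool flag
  options.show_fusion_subcomputations = false;  // new toggle in this patch
  return xla::RenderGraph(computation, /*label=*/"", debug_options,
                          xla::RenderedGraphFormat::kDot,
                          /*hlo_execution_profile=*/nullptr, options);
}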
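Note on the ExchangeHalo rewrite in spmd_partitioner_util.cc above: a halo larger than one shard is now gathered in multiple collective-permute steps, where step i takes min(max_halo - shard_size * i, shard_size) rows from the shard i+1 positions away. A self-contained sketch of that arithmetic, using the numbers from the new ReduceWindowTiledOneSideHaloBeyondNeighbor test (9 rows over 5 partitions, so shard size 2 and a left halo of 3); this is illustrative code, not the partitioner itself:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Per-step halo sizes for one side, mirroring the loop bounds in the patch:
// steps run from the farthest source shard toward the immediate neighbor.
std::vector<int64_t> HaloStepSizes(int64_t max_halo, int64_t shard_size) {
  std::vector<int64_t> sizes;
  int64_t steps = (max_halo + shard_size - 1) / shard_size;  // CeilOfRatio
  for (int64_t i = steps - 1; i >= 0; --i) {
    sizes.push_back(std::min(max_halo - shard_size * i, shard_size));
  }
  return sizes;
}

int main() {
  // Left halo of 3 with shard size 2: a 1-row slice from two shards away,
  // then the full 2-row neighbor shard, matching the f32[1,2] and f32[2,2]
  // collective-permutes expected by the test.
  for (int64_t s : HaloStepSizes(/*max_halo=*/3, /*shard_size=*/2)) {
    std::printf("%lld\n", static_cast<long long>(s));
  }
  return 0;
}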