From d5630178ea205e129f6dc50f840c8dba0821cd97 Mon Sep 17 00:00:00 2001
From: Tomasz Strejczek
Date: Sat, 12 Sep 2020 17:29:24 +0200
Subject: [PATCH 01/54] Make fast build work with MSVC

---
 .bazelrc                                      |  2 -
 tensorflow/compiler/xla/literal.cc            |  1 +
 .../compiler/xla/service/hlo_instruction.h    |  8 ++-
 .../compiler/xla/service/hlo_instructions.cc  |  1 +
 tensorflow/core/kernels/concat_lib_gpu.cc     | 29 +++++++++
 .../core/kernels/cwise_op_gpu_conj.cu.cc      |  8 +++
 .../kernels/dense_update_functor_gpu.cu.cc    | 19 ++++++
 .../core/kernels/depthtospace_op_gpu.cu.cc    | 55 +++++++++++++++++
 .../image/resize_bilinear_op_gpu.cu.cc        |  5 ++
 .../core/kernels/spacetodepth_op_gpu.cu.cc    | 61 +++++++++++++++++++
 .../windows/msvc_wrapper_for_nvcc.py.tpl      |  4 +-
 11 files changed, 189 insertions(+), 4 deletions(-)

diff --git a/.bazelrc b/.bazelrc
index 774f614cddd..fd2ab83a571 100644
--- a/.bazelrc
+++ b/.bazelrc
@@ -339,8 +339,6 @@ build:windows --copt=/experimental:preprocessor
 build:windows --host_copt=/experimental:preprocessor
 
 # Misc build options we need for windows.
-build:windows --linkopt=/DEBUG
-build:windows --host_linkopt=/DEBUG
 build:windows --linkopt=/OPT:REF
 build:windows --host_linkopt=/OPT:REF
 build:windows --linkopt=/OPT:ICF
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index d26e0881c53..d0e29983070 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -1388,6 +1388,7 @@ typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)),
                         Literal>::type
 BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
   LOG(FATAL) << "Invalid bitcast between types of different sizes.";
+  std::abort();
 }
 
 template
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index e2abd4496d2..825a4f819af 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1644,12 +1644,15 @@ class HloInstruction {
   // Returns the dimension sizes or numbers associated with this instruction.
   virtual const std::vector<int64>& dimensions() const {
     LOG(FATAL) << "Unimplemented method.";
+    std::abort();
   }
   virtual int64 dimensions(int64 index) const {
     LOG(FATAL) << "Unimplemented method.";
+    std::abort();
   }
   virtual std::vector<int64>* mutable_dimensions() {
     LOG(FATAL) << "Unimplemented method.";
+    std::abort();
   }
 
   // Delegates to HloConcatenateInstruction::concatenate_dimension.
@@ -1800,15 +1803,17 @@ class HloInstruction {
   // convolution.
   virtual const Window& window() const {
     LOG(FATAL) << "Unimplemented method.";
+    std::abort();
   }
 
   // Sets the window data in a windowed operation such as convolution.
   virtual void set_window(const Window& window) {
     LOG(FATAL) << "Unimplemented method.";
+    std::abort();
   }
 
   // Returns the unique_indices field.
-  virtual bool unique_indices() const { LOG(FATAL) << "Unimplemented method."; }
+  virtual bool unique_indices() const { LOG(FATAL) << "Unimplemented method."; std::abort(); }
 
   // Returns data on the dimension numbers used for a convolution operation,
   // which may be a kConvolution instruction or a kCustomCall that implements a
@@ -1971,6 +1976,7 @@
       HloCloneContext* context) const {
     // TODO(b/80131774): This should be pure virtual.
     LOG(FATAL) << "Unimplemented method.";
+    std::abort();
   }
 
   // Implementation for non-common logic of ExtraAttributesToString.
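// Background on the recurring "+    std::abort();" additions in this patch:
// LOG(FATAL) terminates the process at runtime, but MSVC's flow analysis does
// not treat it as noreturn in fast/non-release builds, so a non-void function
// that ends in LOG(FATAL) presumably trips "not all control paths return a
// value" (warning C4715). std::abort() is declared [[noreturn]], which the
// compiler does understand, so appending it satisfies the diagnostic without
// changing runtime behavior. A minimal sketch of the idiom, with a
// hypothetical function name (not from this patch):
//
//   int UnimplementedMethod() {
//     LOG(FATAL) << "Unimplemented method.";  // aborts at runtime
//     std::abort();  // statically provable noreturn; silences C4715
//   }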
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index df225e27aad..6f8a5e619df 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -1363,6 +1363,7 @@ std::unique_ptr<HloInstruction> HloTraceInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
   LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode());
+  std::abort();
 }
 
 HloFusionInstruction::HloFusionInstruction(const Shape& shape,
diff --git a/tensorflow/core/kernels/concat_lib_gpu.cc b/tensorflow/core/kernels/concat_lib_gpu.cc
index de029397847..df080142f58 100644
--- a/tensorflow/core/kernels/concat_lib_gpu.cc
+++ b/tensorflow/core/kernels/concat_lib_gpu.cc
@@ -107,6 +107,35 @@ TF_CALL_GPU_ALL_TYPES(REGISTER);
 
 #undef REGISTER
 
+#if defined(_MSC_VER)
+// Required by MSVC non-release build
+#define FORCE_CONCAT(TYPE)                                                    \
+  template <>                                                                 \
+  void ConcatGPU<TYPE>(                                                       \
+      OpKernelContext* c,                                                     \
+      const std::vector<std::unique_ptr<typename TTypes<TYPE, 2>::ConstMatrix>>& \
+          inputs_flat,                                                        \
+      Tensor* output, typename TTypes<TYPE, 2>::Tensor* output_flat) {        \
+    LOG(FATAL) << "Should not be called";                                     \
+  }
+
+FORCE_CONCAT(tensorflow::Variant)
+FORCE_CONCAT(tensorflow::ResourceHandle)
+FORCE_CONCAT(unsigned short)
+FORCE_CONCAT(signed char)
+FORCE_CONCAT(tensorflow::tstring)
+FORCE_CONCAT(Eigen::QUInt8)
+FORCE_CONCAT(Eigen::QInt8)
+FORCE_CONCAT(Eigen::QUInt16)
+FORCE_CONCAT(Eigen::QInt16)
+FORCE_CONCAT(Eigen::QInt32)
+FORCE_CONCAT(unsigned int)
+FORCE_CONCAT(unsigned __int64)
+
+#undef FORCE_CONCAT
+
+#endif
+
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
index e674d5af227..1cbde17fa85 100644
--- a/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
+++ b/tensorflow/core/kernels/cwise_op_gpu_conj.cu.cc
@@ -21,6 +21,14 @@ namespace tensorflow {
 namespace functor {
 DEFINE_UNARY1(conj, complex64);
 DEFINE_UNARY1(conj, complex128);
+
+#if defined(_MSC_VER)
+// Non-release build with MSVC needs these symbols
+DEFINE_UNARY1(conj, float);
+DEFINE_UNARY1(conj, double);
+#endif
+
+
 }  // namespace functor
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
index 40dbfba1e58..237c7b4f88b 100644
--- a/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
+++ b/tensorflow/core/kernels/dense_update_functor_gpu.cu.cc
@@ -71,6 +71,25 @@ TF_CALL_int8(DEFINE_GPU_KERNELS);
 TF_CALL_uint32(DEFINE_GPU_KERNELS);
 #undef DEFINE_GPU_KERNELS
 
+#if defined(_MSC_VER)
+
+template <>
+struct functor::DenseUpdate<GPUDevice, tensorflow::Variant, ASSIGN> {
+  void operator()(const GPUDevice& d,
+                  typename TTypes<tensorflow::Variant>::Flat params,
+                  typename TTypes<tensorflow::Variant>::ConstFlat update) {
+    LOG(FATAL) << "Not handling type tensorflow::Variant";
+  }
+};
+
+// The function is required to force the above template specialization.
+// Without it, the MSVC compiler doesn't include the functor in the object file.
+void _force_instantiation(const GPUDevice& d,
+                          typename TTypes<tensorflow::Variant>::Flat params,
+                          typename TTypes<tensorflow::Variant>::ConstFlat update) {
+  functor::DenseUpdate<GPUDevice, tensorflow::Variant, ASSIGN> x;
+  x(d, params, update);
+}
+#endif  // _MSC_VER
+
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
index 7ed6eee17f5..1faa88af56e 100644
--- a/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthtospace_op_gpu.cu.cc
@@ -22,6 +22,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"
 
+#if defined(_MSC_VER)
+#include "tensorflow/core/framework/register_types.h"
+#endif
+
 namespace tensorflow {
 namespace {
@@ -251,6 +255,57 @@ template struct functor::DepthToSpaceOpFunctor;
 
+#if defined(_MSC_VER)
+#define FORCE_DEPTH(TYPE, NAME, NUM, DEVICE)                                  \
+  template <>                                                                 \
+  struct functor::DepthToSpaceOpFunctor<DEVICE, TYPE, NUM> {                  \
+    void operator()(const DEVICE& d,                                          \
+                    typename TTypes<TYPE, 4>::ConstTensor input,              \
+                    int block_size, typename TTypes<TYPE, 4>::Tensor output) { \
+      LOG(FATAL) << "Should not be called.";                                  \
+    }                                                                         \
+    void operator()(const DEVICE& d,                                          \
+                    typename TTypes<TYPE, 5>::ConstTensor input,              \
+                    int block_size, typename TTypes<TYPE, 5>::Tensor output) { \
+      LOG(FATAL) << "Should not be called.";                                  \
+    }                                                                         \
+  };                                                                          \
+  void _force_DepthToSpaceOpFunctor##NAME(                                    \
+      const DEVICE& d, typename TTypes<TYPE, 4>::ConstTensor input,           \
+      int block_size, typename TTypes<TYPE, 4>::Tensor output) {              \
+    functor::DepthToSpaceOpFunctor<DEVICE, TYPE, NUM> op;                     \
+    op(d, input, block_size, output);                                         \
+  }                                                                           \
+  void _force_DepthToSpaceOpFunctor##NAME##_2(                                \
+      const DEVICE& d, typename TTypes<TYPE, 5>::ConstTensor input,           \
+      int block_size, typename TTypes<TYPE, 5>::Tensor output) {              \
+    functor::DepthToSpaceOpFunctor<DEVICE, TYPE, NUM> op;                     \
+    op(d, input, block_size, output);                                         \
+  }
+
+FORCE_DEPTH(__int64, int64, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(unsigned __int64, uint64, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(unsigned int, uint, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(int, int, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(unsigned short, ushort, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(short, short, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(unsigned char, uchar, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(signed char, char, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(bfloat16, bfloat16, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(double, double, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(complex64, complex64, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(complex128, complex128, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(bool, bool, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(tensorflow::tstring, tstring, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(tensorflow::ResourceHandle, ResourceHandle, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(tensorflow::Variant, variant, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(Eigen::QInt8, qint8, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(Eigen::QInt8, qint8_2, FORMAT_NHWC, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(Eigen::half, half, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(float, float, FORMAT_NCHW, Eigen::ThreadPoolDevice)
+FORCE_DEPTH(Eigen::QInt8, qint8gpu, FORMAT_NCHW, GPUDevice)
+FORCE_DEPTH(Eigen::QInt8, qint8gpu_2, FORMAT_NHWC, GPUDevice)
+
+#undef FORCE_DEPTH
+
+#endif
+
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/tensorflow/core/kernels/image/resize_bilinear_op_gpu.cu.cc b/tensorflow/core/kernels/image/resize_bilinear_op_gpu.cu.cc
index c8dfe754060..1f1d0cddf26 100644
--- a/tensorflow/core/kernels/image/resize_bilinear_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/image/resize_bilinear_op_gpu.cu.cc
@@ -451,6 +451,11 @@ TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPEC);
 
 TF_CALL_GPU_NUMBER_TYPES_NO_HALF(DEFINE_GRAD_GPU_SPEC);
 
+#if defined(_MSC_VER)
+// Required for MSVC debug build
+TF_CALL_half(DEFINE_GRAD_GPU_SPEC)
+#endif
+
 #undef DEFINE_GPU_SPEC
 #undef DEFINE_GRAD_GPU_SPEC
diff --git a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
index 610cb5eed59..cf1cb56dcae 100644
--- a/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/spacetodepth_op_gpu.cu.cc
@@ -22,6 +22,10 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/gpu_kernel_helper.h"
 
+#if defined(_MSC_VER)
+#include "tensorflow/core/framework/register_types.h"
+#endif
+
 namespace tensorflow {
 
 typedef Eigen::GpuDevice GPUDevice;
@@ -252,6 +256,63 @@ template struct functor::SpaceToDepthOpFunctor;
 // NCHW_VECT_C with 4 x qint8 can be treated as NCHW int32.
 template struct functor::SpaceToDepthOpFunctor;
 
+#if defined(_MSC_VER)
+#define FORCE_DEPTH(TYPE, NAME, NUM, DEVICE)                                  \
+  template <>                                                                 \
+  struct functor::SpaceToDepthOpFunctor<DEVICE, TYPE, NUM> {                  \
+    void operator()(const DEVICE& d,                                          \
+                    typename TTypes<TYPE, 4>::ConstTensor input,              \
+                    int block_size, typename TTypes<TYPE, 4>::Tensor output) { \
+      LOG(FATAL) << "Should not be called.";                                  \
+    }                                                                         \
+  };                                                                          \
+  void _force_SpaceToDepthOpFunctor##NAME(                                    \
+      const DEVICE& d, typename TTypes<TYPE, 4>::ConstTensor input,           \
+      int block_size, typename TTypes<TYPE, 4>::Tensor output) {              \
+    functor::SpaceToDepthOpFunctor<DEVICE, TYPE, NUM> op;                     \
+    op(d, input, block_size, output);                                         \
+  }
+
+#define FORCE_DEPTH2(TYPE, NAME, DEVICE) FORCE_DEPTH(TYPE, NAME, FORMAT_NCHW, DEVICE) FORCE_DEPTH(TYPE, NAME ## _2, FORMAT_NHWC, DEVICE)
+
+FORCE_DEPTH2(__int64, int64, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(unsigned __int64, uint64, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(unsigned int, uint, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(unsigned short, ushort, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(short, short, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(signed char, char, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(unsigned char, uchar, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(bfloat16, bfloat16, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(double, double, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(complex64, complex64, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(complex128, complex128, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(bool, bool, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(tensorflow::tstring, tstring, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(tensorflow::ResourceHandle, ResourceHandle, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(tensorflow::Variant, variant, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(Eigen::QInt8, qint8, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(Eigen::half, half, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(float, float, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(int, int, Eigen::ThreadPoolDevice)
+FORCE_DEPTH2(Eigen::QInt8, qint8gpu, GPUDevice)
+
+// Special case for int, FORMAT_NHWC
+template <>
+struct functor::SpaceToDepthOpFunctor<GPUDevice, int, FORMAT_NHWC> {
+  void operator()(const GPUDevice& d, typename TTypes<int, 4>::ConstTensor input,
+                  int block_size, typename TTypes<int, 4>::Tensor output) {
+    LOG(FATAL) << "Should not be called.";
+  }
+};
+void _force_SpaceToDepthOpFunctor_int(const GPUDevice& d,
+                                      typename TTypes<int, 4>::ConstTensor input,
+                                      int block_size,
+                                      typename TTypes<int, 4>::Tensor output) {
+  functor::SpaceToDepthOpFunctor<GPUDevice, int, FORMAT_NHWC> op;
+  op(d, input, block_size, output);
+}
+
+#undef FORCE_DEPTH
+#undef FORCE_DEPTH2
+
+#endif
+
 }  // end namespace tensorflow
 
 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
diff --git a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
index 7e0674963bf..e406ffd9d4f 100644
--- a/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
+++ b/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
@@ -117,7 +117,9 @@ def InvokeNvcc(argv, log=False):
   nvcc_compiler_options, argv = GetNvccOptions(argv)
 
   opt_option, argv = GetOptionValue(argv, '/O')
-  opt = ['-g']
+  # Originally '-g' was provided as an initial switch. However, nvcc expands it
+  # for MSVC to /Zi, which generates a vcXXX.pdb file not known to bazel.
+  opt = []
 
   if (len(opt_option) > 0 and opt_option[0] != 'd'):
     opt = ['-O2']

From 643c9bd6ede0ea65fbbeb3d3b89e2d3ef4e655f7 Mon Sep 17 00:00:00 2001
From: ml-0
Date: Wed, 29 Jul 2020 16:02:46 +0200
Subject: [PATCH 02/54] adds target for generic cortex M4F device. Allows
 registering of a callback for DebugLog such that the target is really
 independent of any chip and board specifics.

---
 .../micro/cortex-m4f-gcc-generic/README.md    | 19 +++++
 .../micro/cortex-m4f-gcc-generic/debug_log.cc | 39 +++++++++
 .../micro/cortex-m4f-gcc-generic/debug_log.h  | 38 +++++++++
 .../cortex-m4f-gcc-generic_makefile.inc       | 80 +++++++++++++++++++
 4 files changed, 176 insertions(+)
 create mode 100644 tensorflow/lite/micro/cortex-m4f-gcc-generic/README.md
 create mode 100644 tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.cc
 create mode 100644 tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h
 create mode 100644 tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc

diff --git a/tensorflow/lite/micro/cortex-m4f-gcc-generic/README.md b/tensorflow/lite/micro/cortex-m4f-gcc-generic/README.md
new file mode 100644
index 00000000000..4fb5c5d552e
--- /dev/null
+++ b/tensorflow/lite/micro/cortex-m4f-gcc-generic/README.md
@@ -0,0 +1,19 @@
+# Generic Cortex-M4F customizations
+The customization requires a definition where the debug log goes to. The purpose of the generic Cortex-M4F target is to generate a TFLu library file for use in application projects outside of this repo. As the chip HAL and the board specific layer are only defined in the application project, the TFLu library cannot write the debug log anywhere. Instead, we allow the application layer to register a callback function for writing the TFLu kernel debug log.
+
+# Usage
+The application layer must implement and register the callback before calling the network in a way similar to
+
+    void debug_log_printf(const char* s)
+    {
+      printf(s);
+    }
+
+    int main(void)
+    {
+      // Register callback for printing debug log
+      DebugLog_register_callback(debug_log_printf);
+
+      // now call the network
+      TfLiteStatus invoke_status = interpreter->Invoke();
+    }
diff --git a/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.cc b/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.cc
new file mode 100644
index 00000000000..0fd1e894db2
--- /dev/null
+++ b/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.cc
@@ -0,0 +1,39 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implementation for the DebugLog() function that prints to the debug logger on a
+// generic cortex-m4f device.
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#include "tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h"
+
+static void (*DebugLog_callback)(const char* s) = nullptr;
+
+extern void DebugLog_register_callback(void (*cb)(const char* s)) {
+  DebugLog_callback = cb;
+}
+
+extern void DebugLog(const char* s) {
+  if (DebugLog_callback) {
+    DebugLog_callback(s);
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h b/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h
new file mode 100644
index 00000000000..c15a05eaf62
--- /dev/null
+++ b/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_MICRO_CORTEX_M4F_GENERIC_DEBUG_LOG_H_
+#define TENSORFLOW_LITE_MICRO_CORTEX_M4F_GENERIC_DEBUG_LOG_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+// This function is used to register a callback for debug logging.
+// It must be called before the first call to DebugLog().
+extern void DebugLog_register_callback(void (*cb)(const char* s));
+
+// This function should be implemented by each target platform, and provide a
+// way for strings to be output to some text stream. For more information, see
+// tensorflow/lite/micro/debug_log.cc.
+// Note that before the first call to DebugLog()
+// a callback function must be registered by calling DebugLog_register_callback().
+extern void DebugLog(const char* s);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+
+#endif  // TENSORFLOW_LITE_MICRO_CORTEX_M4F_GENERIC_DEBUG_LOG_H_
diff --git a/tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc
new file mode 100644
index 00000000000..c31b5089513
--- /dev/null
+++ b/tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc
@@ -0,0 +1,80 @@
+# Settings for cortex-m4f generic device, gcc build.
+ifeq ($(TARGET),$(filter $(TARGET),\ + cortex-m4f-gcc-generic\ + )) + export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) + TARGET_ARCH := cortex-m4 + TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- + # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions + # with the hard interfaces. + GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/ + + $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) + $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) + + # Use the faster depthwise conv implementation. + ALL_TAGS += portable_optimized + + PLATFORM_FLAGS = \ + -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ + -DTF_LITE_STATIC_MEMORY \ + -DTF_LITE_MCU_DEBUG_LOG \ + -D __FPU_PRESENT=1 \ + -DARM_MATH_CM4 \ + -fmessage-length=0 \ + -fno-exceptions \ + -fno-unwind-tables \ + -ffunction-sections \ + -fdata-sections \ + -funsigned-char \ + -MMD \ + -mcpu=cortex-m4 \ + -mthumb \ + -mfpu=fpv4-sp-d16 \ + -mfloat-abi=hard \ + -Wall \ + -Wextra \ + -Wno-shadow \ + -Wno-vla \ + -Wno-strict-aliasing \ + -Wno-type-limits \ + -Wno-unused-parameter \ + -Wno-missing-field-initializers \ + -Wno-write-strings \ + -Wno-sign-compare \ + -Wunused-function \ + -fno-delete-null-pointer-checks \ + -fomit-frame-pointer \ + -ggdb \ + -O3 + CXXFLAGS += $(PLATFORM_FLAGS) -std=gnu++11 -fno-rtti -fno-use-cxa-atexit + CCFLAGS += $(PLATFORM_FLAGS) + + BUILD_TYPE := micro + + MICROLITE_LIBS := \ + -lm + INCLUDES += \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \ + -I$(GCC_ARM)/arm-none-eabi/ \ + + CMSIS_SRC_DIR := $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source + THIRD_PARTY_CC_SRCS := \ + $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_dot_prod_q15.c \ + $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_mult_q15.c \ + $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_init_q15.c \ + $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_q15.c \ + $(CMSIS_SRC_DIR)/TransformFunctions/arm_bitreversal2.c \ + $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_q15.c \ + $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_radix4_q15.c \ + $(CMSIS_SRC_DIR)/CommonTables/arm_const_structs.c \ + $(CMSIS_SRC_DIR)/CommonTables/arm_common_tables.c \ + $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_mean_q15.c \ + $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_max_q7.c + + # These are tests that don't currently work on the generic cortex-m4f. + EXCLUDED_TESTS := + MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) + +endif From 59b2628d355c7ad232712177d82948b7892d2477 Mon Sep 17 00:00:00 2001 From: ml-0 Date: Thu, 24 Sep 2020 17:49:11 +0200 Subject: [PATCH 03/54] adds input from review and agreements from discussion. 
--- .../lite/micro/cortex_m4_generic/debug_log.cc | 26 ------ .../README.md | 4 +- .../debug_log.cc | 12 +-- .../debug_log.h | 11 ++- .../cortex-m4f-gcc-generic_makefile.inc | 80 ------------------- .../targets/cortex_m4_generic_makefile.inc | 51 ------------ .../targets/cortex_m_gcc_generic_makefile.inc | 68 ++++++++++++++++ 7 files changed, 82 insertions(+), 170 deletions(-) delete mode 100644 tensorflow/lite/micro/cortex_m4_generic/debug_log.cc rename tensorflow/lite/micro/{cortex-m4f-gcc-generic => cortex_m_gcc_generic}/README.md (53%) rename tensorflow/lite/micro/{cortex-m4f-gcc-generic => cortex_m_gcc_generic}/debug_log.cc (75%) rename tensorflow/lite/micro/{cortex-m4f-gcc-generic => cortex_m_gcc_generic}/debug_log.h (80%) delete mode 100644 tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc delete mode 100644 tensorflow/lite/micro/tools/make/targets/cortex_m4_generic_makefile.inc create mode 100644 tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc diff --git a/tensorflow/lite/micro/cortex_m4_generic/debug_log.cc b/tensorflow/lite/micro/cortex_m4_generic/debug_log.cc deleted file mode 100644 index 615b5a0f393..00000000000 --- a/tensorflow/lite/micro/cortex_m4_generic/debug_log.cc +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/lite/micro/debug_log.h" - -#ifndef TF_LITE_STRIP_ERROR_STRINGS -#include -#endif - -extern "C" void DebugLog(const char* s) { -#ifndef TF_LITE_STRIP_ERROR_STRINGS - fprintf(stderr, "%s", s); -#endif -} diff --git a/tensorflow/lite/micro/cortex-m4f-gcc-generic/README.md b/tensorflow/lite/micro/cortex_m_gcc_generic/README.md similarity index 53% rename from tensorflow/lite/micro/cortex-m4f-gcc-generic/README.md rename to tensorflow/lite/micro/cortex_m_gcc_generic/README.md index 4fb5c5d552e..f739ec0c1c2 100644 --- a/tensorflow/lite/micro/cortex-m4f-gcc-generic/README.md +++ b/tensorflow/lite/micro/cortex_m_gcc_generic/README.md @@ -1,5 +1,5 @@ -# Generic Cortex-M4F customizations -The customization requires a definition where the debug log goes to. The purpose of the generic Cortex-M4F target is to generate a TFLu library file for use in application projects outside of this repo. As the chip HAL and the board specific layer are only defined in the application project, the TFLu library cannot write the debug log anywhere. Instead, we allow the application layer to register a callback function for writing the TFLu kernel debug log. +# Generic Cortex-Mx customizations +The customization requires a definition where the debug log goes to. The purpose of the generic Cortex-Mx target is to generate a TFLu library file for use in application projects outside of this repo. As the chip HAL and the board specific layer are only defined in the application project, the TFLu library cannot write the debug log anywhere. 
Instead, we allow the application layer to register a callback function for writing the TFLu kernel debug log.
+# Generic Cortex-Mx customizations
+The customization requires a definition where the debug log goes to. The purpose of the generic Cortex-Mx target is to generate a TFLu library file for use in application projects outside of this repo. As the chip HAL and the board specific layer are only defined in the application project, the TFLu library cannot write the debug log anywhere. Instead, we allow the application layer to register a callback function for writing the TFLu kernel debug log.
 
 # Usage
 The application layer must implement and register the callback before calling the network in a way similar to
diff --git a/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.cc b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc
similarity index 75%
rename from tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.cc
rename to tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc
index 0fd1e894db2..45c40b3eee7 100644
--- a/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.cc
+++ b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc
@@ -1,4 +1,4 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,24 +14,26 @@ limitations under the License.
 ==============================================================================*/
 
 // Implementation for the DebugLog() function that prints to the debug logger on a
-// generic cortex-m4f device.
+// generic cortex-m device.
 
 #ifdef __cplusplus
 extern "C" {
 #endif  // __cplusplus
 
-#include "tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h"
+#include "tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h"
 
 static void (*DebugLog_callback)(const char* s) = nullptr;
 
-extern void DebugLog_register_callback(void (*cb)(const char* s)) {
+extern "C" void DebugLog_register_callback(void (*cb)(const char* s)) {
   DebugLog_callback = cb;
 }
 
-extern void DebugLog(const char* s) {
+extern "C" void DebugLog(const char* s) {
+#ifndef TF_LITE_STRIP_ERROR_STRINGS
   if (DebugLog_callback) {
     DebugLog_callback(s);
   }
+#endif
 }
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h
similarity index 80%
rename from tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h
rename to tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h
index c15a05eaf62..c6c1c71f1c3 100644
--- a/tensorflow/lite/micro/cortex-m4f-gcc-generic/debug_log.h
+++ b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_MICRO_CORTEX_M4F_GENERIC_DEBUG_LOG_H_
-#define TENSORFLOW_LITE_MICRO_CORTEX_M4F_GENERIC_DEBUG_LOG_H_
+#ifndef TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_H_
+#define TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -22,17 +21,17 @@ extern "C" {
 
 // This function is used to register a callback for debug logging.
 // It must be called before the first call to DebugLog().
-extern void DebugLog_register_callback(void (*cb)(const char* s));
+void DebugLog_register_callback(void (*cb)(const char* s));
 
 // This function should be implemented by each target platform, and provide a
 // way for strings to be output to some text stream. For more information, see
 // tensorflow/lite/micro/debug_log.cc.
 // Note that before the first call to DebugLog()
 // a callback function must be registered by calling DebugLog_register_callback().
-extern void DebugLog(const char* s); +void DebugLog(const char* s); #ifdef __cplusplus } // extern "C" #endif // __cplusplus -#endif // TENSORFLOW_LITE_MICRO_CORTEX_M4F_GENERIC_DEBUG_LOG_H_ +#endif // TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_H_ diff --git a/tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc deleted file mode 100644 index c31b5089513..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/cortex-m4f-gcc-generic_makefile.inc +++ /dev/null @@ -1,80 +0,0 @@ -# Settings for cortex-m4f generic device, gcc build. -ifeq ($(TARGET),$(filter $(TARGET),\ - cortex-m4f-gcc-generic\ - )) - export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) - TARGET_ARCH := cortex-m4 - TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- - # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions - # with the hard interfaces. - GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/ - - $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) - $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) - - # Use the faster depthwise conv implementation. - ALL_TAGS += portable_optimized - - PLATFORM_FLAGS = \ - -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ - -DTF_LITE_STATIC_MEMORY \ - -DTF_LITE_MCU_DEBUG_LOG \ - -D __FPU_PRESENT=1 \ - -DARM_MATH_CM4 \ - -fmessage-length=0 \ - -fno-exceptions \ - -fno-unwind-tables \ - -ffunction-sections \ - -fdata-sections \ - -funsigned-char \ - -MMD \ - -mcpu=cortex-m4 \ - -mthumb \ - -mfpu=fpv4-sp-d16 \ - -mfloat-abi=hard \ - -Wall \ - -Wextra \ - -Wno-shadow \ - -Wno-vla \ - -Wno-strict-aliasing \ - -Wno-type-limits \ - -Wno-unused-parameter \ - -Wno-missing-field-initializers \ - -Wno-write-strings \ - -Wno-sign-compare \ - -Wunused-function \ - -fno-delete-null-pointer-checks \ - -fomit-frame-pointer \ - -ggdb \ - -O3 - CXXFLAGS += $(PLATFORM_FLAGS) -std=gnu++11 -fno-rtti -fno-use-cxa-atexit - CCFLAGS += $(PLATFORM_FLAGS) - - BUILD_TYPE := micro - - MICROLITE_LIBS := \ - -lm - INCLUDES += \ - -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ - -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \ - -I$(GCC_ARM)/arm-none-eabi/ \ - - CMSIS_SRC_DIR := $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source - THIRD_PARTY_CC_SRCS := \ - $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_dot_prod_q15.c \ - $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_mult_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_init_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_bitreversal2.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_radix4_q15.c \ - $(CMSIS_SRC_DIR)/CommonTables/arm_const_structs.c \ - $(CMSIS_SRC_DIR)/CommonTables/arm_common_tables.c \ - $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_mean_q15.c \ - $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_max_q7.c - - # These are tests that don't currently work on the generic cortex-m4f. 
- EXCLUDED_TESTS := - MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) - -endif diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m4_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m4_generic_makefile.inc deleted file mode 100644 index cc34d684054..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/cortex_m4_generic_makefile.inc +++ /dev/null @@ -1,51 +0,0 @@ -# Generic Makefile target for ARM Cortex M4 builds. -# REQUIRED: -# - TOOLCHAIN_PATH: The path to the ARM GCC toolchain to use. - -ifeq ($(TARGET), cortex_m4_generic) - TARGET_ARCH := arm - TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- - export PATH := $(TOOLCHAIN_PATH):$(PATH) - - PLATFORM_FLAGS = \ - -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ - -DTF_LITE_STATIC_MEMORY \ - -DNDEBUG \ - -DTF_LITE_MCU_DEBUG_LOG \ - -D __FPU_PRESENT=1 \ - -DARM_MATH_CM4 \ - -fno-rtti \ - -fmessage-length=0 \ - -fno-exceptions \ - -fno-unwind-tables \ - -ffunction-sections \ - -fdata-sections \ - -funsigned-char \ - -MMD \ - -mcpu=cortex-m4 \ - -mthumb \ - -mfpu=fpv4-sp-d16 \ - -mfloat-abi=softfp \ - -std=gnu++11 \ - -Wvla \ - -Wall \ - -Wextra \ - -Wno-shadow \ - -Wno-missing-field-initializers \ - -Wno-strict-aliasing \ - -Wno-type-limits \ - -Wno-unused-function \ - -Wno-unused-parameter \ - -fno-delete-null-pointer-checks \ - -fno-threadsafe-statics \ - -fomit-frame-pointer \ - -fno-use-cxa-atexit \ - -O3 - - CXXFLAGS += $(PLATFORM_FLAGS) - CCFLAGS += $(PLATFORM_FLAGS) - - LDFLAGS += -Wl,--gc-sections - -endif - diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc new file mode 100644 index 00000000000..732a11b50aa --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc @@ -0,0 +1,68 @@ +# Generic Makefile target for ARM Cortex Mx gcc builds. 
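+#
+# Example invocation (taken from the CI script added later in this series;
+# CORTEX_M_CORE is required on the command line and selects the float ABI,
+# see the CORTEX_M_CORE handling below):
+#
+#   make -f tensorflow/lite/micro/tools/make/Makefile \
+#       TARGET=cortex_m_gcc_generic CORTEX_M_CORE=M4F microlite
+#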
+ifeq ($(TARGET),$(filter $(TARGET),\ + cortex_m_gcc_generic\ + )) + TARGET_ARCH := arm + TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- + export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) + + $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) + + PLATFORM_FLAGS = \ + -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ + -DTF_LITE_STATIC_MEMORY \ + -DNDEBUG \ + -DTF_LITE_MCU_DEBUG_LOG \ + -D __FPU_PRESENT=1 \ + -DARM_MATH_CM4 \ + -fmessage-length=0 \ + -fno-exceptions \ + -fno-unwind-tables \ + -ffunction-sections \ + -fdata-sections \ + -funsigned-char \ + -MMD \ + -mcpu=cortex-m4 \ + -mthumb \ + -mfpu=fpv4-sp-d16 \ + -Wvla \ + -Wall \ + -Wextra \ + -Wno-shadow \ + -Wno-missing-field-initializers \ + -Wno-double-promotion \ + -Wno-strict-aliasing \ + -Wno-type-limits \ + -Wno-unused-function \ + -Wno-unused-variable \ + -Wno-unused-parameter \ + -Wno-write-strings \ + -Wno-sign-compare \ + -fno-delete-null-pointer-checks \ + -fomit-frame-pointer \ + -nostdlib \ + -ggdb \ + -O3 + +ifdef CORTEX_M_CORE + ifeq ($(CORTEX_M_CORE), M4F) + PLATFORM_FLAGS += -mfloat-abi=hard + else ifeq ($(CORTEX_M_CORE), M4) + PLATFORM_FLAGS += -mfloat-abi=softfp + else + $(error invalid target defined in command line option CORTEX_M_CORE=[M4|M4F]) + endif +else + $(error CORTEX_M_CORE=[M4|M4F] not defined on the command line) +endif + + CXXFLAGS += $(PLATFORM_FLAGS) -std=gnu++11 -fno-rtti -fpermissive -fno-use-cxa-atexit -fno-threadsafe-statics + CCFLAGS += $(PLATFORM_FLAGS) + CXXFLAGS := $(CXXFLAGS:-DTF_LITE_STATIC_MEMORY=) + CCFLAGS := $(CCFLAGS:-DTF_LITE_STATIC_MEMORY=) + + INCLUDES += \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ + +endif \ No newline at end of file From b04adc3bd6e49f0aa795a3d85cc3250a6772f38b Mon Sep 17 00:00:00 2001 From: ml-0 Date: Tue, 29 Sep 2020 10:59:46 +0200 Subject: [PATCH 04/54] reduces number of platform flags, adds CI test, adds suggestions from review. --- .../lite/micro/cortex_m_gcc_generic/README.md | 18 +------ .../micro/cortex_m_gcc_generic/debug_log.cc | 15 +++--- .../{debug_log.h => debug_log_callback.h} | 34 +++++++----- tensorflow/lite/micro/debug_log.h | 10 +++- .../lite/micro/tools/ci_build/test_all.sh | 3 ++ .../ci_build/test_cortex_m_gcc_generic.sh | 49 +++++++++++++++++ .../targets/cortex_m_gcc_generic_makefile.inc | 54 ++++--------------- 7 files changed, 104 insertions(+), 79 deletions(-) rename tensorflow/lite/micro/cortex_m_gcc_generic/{debug_log.h => debug_log_callback.h} (50%) create mode 100644 tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh diff --git a/tensorflow/lite/micro/cortex_m_gcc_generic/README.md b/tensorflow/lite/micro/cortex_m_gcc_generic/README.md index f739ec0c1c2..9ec2468c8d9 100644 --- a/tensorflow/lite/micro/cortex_m_gcc_generic/README.md +++ b/tensorflow/lite/micro/cortex_m_gcc_generic/README.md @@ -1,19 +1,5 @@ # Generic Cortex-Mx customizations -The customization requires a definition where the debug log goes to. The purpose of the generic Cortex-Mx target is to generate a TFLu library file for use in application projects outside of this repo. As the chip HAL and the board specific layer are only defined in the application project, the TFLu library cannot write the debug log anywhere. Instead, we allow the application layer to register a callback function for writing the TFLu kernel debug log. +The customization requires a definition where the debug log goes to. 
The purpose of the generic Cortex-Mx target is to generate a TFLM library file for use in application projects outside of this repo. As the chip HAL and the board specific layer are only defined in the application project, the TFLM library cannot write the debug log anywhere. Instead, we allow the application layer to register a callback function for writing the TFLM kernel debug log. # Usage -The application layer must implement and register the callback before calling the network in a way similar to - - void debug_log_printf(const char* s) - { - printf(s); - } - - int main(void) - { - // Register callback for printing debug log - DebugLog_register_callback(debug_log_printf); - - // now call the network - TfLiteStatus invoke_status = interpreter->Invoke(); - } +See debug_log_callback.h diff --git a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc index 45c40b3eee7..a94563aa939 100644 --- a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc +++ b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc @@ -20,18 +20,19 @@ limitations under the License. extern "C" { #endif // __cplusplus -#include "tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h" +#include "tensorflow/lite/micro/debug_log.h" +#include "tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h" -static void (*DebugLog_callback)(const char* s) = nullptr; +static DebugLogCallback debug_log_callback = nullptr; -extern "C" void DebugLog_register_callback(void (*cb)(const char* s)) { - DebugLog_callback = cb; +void RegisterDebugLogCallback(void (*cb)(const char* s)) { + debug_log_callback = cb; } -extern "C" void DebugLog(const char* s) { +void DebugLog(const char* s) { #ifndef TF_LITE_STRIP_ERROR_STRINGS - if (DebugLog_callback) { - DebugLog_callback(s); + if (debug_log_callback) { + debug_log_callback(s); } #endif } diff --git a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h similarity index 50% rename from tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h rename to tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h index c6c1c71f1c3..e246350db99 100644 --- a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.h +++ b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h @@ -12,26 +12,36 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_H_ -#define TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_H_ +#ifndef TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_CALLBACK_H_ +#define TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_CALLBACK_H_ + +// The application layer must implement and register a callback before calling the network in a way similar to +// +// void debug_log_printf(const char* s) +// { +// printf(s); +// } +// +// int main(void) +// { +// // Register callback for printing debug log +// RegisterDebugLogCallback(debug_log_printf); +// +// // now call the network +// TfLiteStatus invoke_status = interpreter->Invoke(); +// } #ifdef __cplusplus extern "C" { #endif // __cplusplus -// This function is used to register a callback for debug logging. -// It must be called before the first call to DebugLog(). 
-void DebugLog_register_callback(void (*cb)(const char* s));
+typedef void (*DebugLogCallback)(const char* s);
 
-// This function should be implemented by each target platform, and provide a
-// way for strings to be output to some text stream. For more information, see
-// tensorflow/lite/micro/debug_log.cc.
-// Note that before the first call to DebugLog()
-// a callback function must be registered by calling DebugLog_register_callback().
-void DebugLog(const char* s);
+// Registers an application-specific callback for debug logging. It must be called before the first call to DebugLog().
+void RegisterDebugLogCallback(DebugLogCallback callback);
 
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
 
-#endif  // TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_H_
+#endif  // TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_CALLBACK_H_
diff --git a/tensorflow/lite/micro/debug_log.h b/tensorflow/lite/micro/debug_log.h
index 1004ab9f5db..c2840d0f4b5 100644
--- a/tensorflow/lite/micro/debug_log.h
+++ b/tensorflow/lite/micro/debug_log.h
@@ -15,9 +15,17 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_MICRO_DEBUG_LOG_H_
 #define TENSORFLOW_LITE_MICRO_DEBUG_LOG_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
 // This function should be implemented by each target platform, and provide a
 // way for strings to be output to some text stream. For more information, see
 // tensorflow/lite/micro/debug_log.cc.
-extern "C" void DebugLog(const char* s);
+void DebugLog(const char* s);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
 
 #endif  // TENSORFLOW_LITE_MICRO_DEBUG_LOG_H_
diff --git a/tensorflow/lite/micro/tools/ci_build/test_all.sh b/tensorflow/lite/micro/tools/ci_build/test_all.sh
index e79d0d4d1ad..354d26d9102 100755
--- a/tensorflow/lite/micro/tools/ci_build/test_all.sh
+++ b/tensorflow/lite/micro/tools/ci_build/test_all.sh
@@ -52,4 +52,7 @@ tensorflow/lite/micro/tools/ci_build/test_stm32f4.sh PRESUBMIT
 echo "Running Arduino tests at `date`"
 tensorflow/lite/micro/tools/ci_build/test_arduino.sh
 
+echo "Running cortex_m_gcc_generic tests at `date`"
+tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh
+
 echo "Finished all micro tests at `date`"
diff --git a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh
new file mode 100644
index 00000000000..77d364bde2d
--- /dev/null
+++ b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh
@@ -0,0 +1,49 @@
+#!/usr/bin/env bash
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Tests the microcontroller code using a Cortex-M4/M4F platform.
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ROOT_DIR=${SCRIPT_DIR}/../../../../..
+cd "${ROOT_DIR}" + +source tensorflow/lite/micro/tools/ci_build/helper_functions.sh + +readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean +readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean_downloads + +TARGET=cortex_m_gcc_generic + +# TODO(b/143715361): downloading first to allow for parallel builds. +readable_run make -f tensorflow/lite/micro/tools/make/Makefile TAGS=cmsis-nn TARGET=${TARGET} CORTEX_M_CORE=M4F third_party_downloads + +# Build for Cortex-M4 (no FPU) without CMSIS +readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean +readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TARGET=${TARGET} CORTEX_M_CORE=M4 microlite + +# Build for Cortex-M4F (FPU present) without CMSIS +readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean +readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TARGET=${TARGET} CORTEX_M_CORE=M4F microlite + +# Build for Cortex-M4 (no FPU) with CMSIS +readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean +readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TAGS=cmsis-nn TARGET=${TARGET} CORTEX_M_CORE=M4 microlite + +# Build for Cortex-M4 (FPU present) with CMSIS +readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean +readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TAGS=cmsis-nn TARGET=${TARGET} CORTEX_M_CORE=M4F microlite diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc index 732a11b50aa..af8dd732684 100644 --- a/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc @@ -1,7 +1,5 @@ # Generic Makefile target for ARM Cortex Mx gcc builds. 
-ifeq ($(TARGET),$(filter $(TARGET),\ - cortex_m_gcc_generic\ - )) +ifeq ($(TARGET), cortex_m_gcc_generic) TARGET_ARCH := arm TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) @@ -10,59 +8,29 @@ ifeq ($(TARGET),$(filter $(TARGET),\ PLATFORM_FLAGS = \ -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ - -DTF_LITE_STATIC_MEMORY \ - -DNDEBUG \ -DTF_LITE_MCU_DEBUG_LOG \ - -D __FPU_PRESENT=1 \ - -DARM_MATH_CM4 \ -fmessage-length=0 \ -fno-exceptions \ -fno-unwind-tables \ -ffunction-sections \ -fdata-sections \ -funsigned-char \ - -MMD \ -mcpu=cortex-m4 \ - -mthumb \ -mfpu=fpv4-sp-d16 \ - -Wvla \ - -Wall \ - -Wextra \ - -Wno-shadow \ - -Wno-missing-field-initializers \ - -Wno-double-promotion \ - -Wno-strict-aliasing \ - -Wno-type-limits \ - -Wno-unused-function \ - -Wno-unused-variable \ - -Wno-unused-parameter \ - -Wno-write-strings \ - -Wno-sign-compare \ - -fno-delete-null-pointer-checks \ - -fomit-frame-pointer \ - -nostdlib \ - -ggdb \ - -O3 + -mthumb \ + -fomit-frame-pointer -ifdef CORTEX_M_CORE - ifeq ($(CORTEX_M_CORE), M4F) - PLATFORM_FLAGS += -mfloat-abi=hard - else ifeq ($(CORTEX_M_CORE), M4) - PLATFORM_FLAGS += -mfloat-abi=softfp - else - $(error invalid target defined in command line option CORTEX_M_CORE=[M4|M4F]) - endif -else +ifeq ($(CORTEX_M_CORE), M4F) + PLATFORM_FLAGS += -mfloat-abi=hard +else ifeq ($(CORTEX_M_CORE), M4) + PLATFORM_FLAGS += -mfloat-abi=softfp +else ifeq ($(CORTEX_M_CORE), ) $(error CORTEX_M_CORE=[M4|M4F] not defined on the command line) +else + $(error invalid target defined in command line option CORTEX_M_CORE=[M4|M4F]) endif - CXXFLAGS += $(PLATFORM_FLAGS) -std=gnu++11 -fno-rtti -fpermissive -fno-use-cxa-atexit -fno-threadsafe-statics + CXXFLAGS += $(PLATFORM_FLAGS) CCFLAGS += $(PLATFORM_FLAGS) - CXXFLAGS := $(CXXFLAGS:-DTF_LITE_STATIC_MEMORY=) - CCFLAGS := $(CCFLAGS:-DTF_LITE_STATIC_MEMORY=) - INCLUDES += \ - -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ - -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ - endif \ No newline at end of file From 9e45a817d4f45b584d2fd38947f9e207232dd539 Mon Sep 17 00:00:00 2001 From: ml-0 Date: Thu, 1 Oct 2020 07:53:44 +0200 Subject: [PATCH 05/54] moves CMSIS Core include into cmsis.inc and some minor changes to provide better (cleaner) code. 
--- tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc | 2 +- .../lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh | 1 - tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc | 7 +++++++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc index a94563aa939..91b176c005f 100644 --- a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc +++ b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc @@ -31,7 +31,7 @@ void RegisterDebugLogCallback(void (*cb)(const char* s)) { void DebugLog(const char* s) { #ifndef TF_LITE_STRIP_ERROR_STRINGS - if (debug_log_callback) { + if (debug_log_callback != nullptr) { debug_log_callback(s); } #endif diff --git a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh index 77d364bde2d..324ea0f7ab9 100644 --- a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh +++ b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh @@ -25,7 +25,6 @@ cd "${ROOT_DIR}" source tensorflow/lite/micro/tools/ci_build/helper_functions.sh readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean -readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean_downloads TARGET=cortex_m_gcc_generic diff --git a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc b/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc index 8bb0d58bad1..f7b740b30dd 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc @@ -98,4 +98,11 @@ ifneq ($(filter cmsis-nn,$(ALL_TAGS)),) $(CMSIS_PATH)CMSIS/DSP/Include/arm_math.h \ $(CMSIS_PATH)CMSIS/DSP/Include/arm_common_tables.h + # Need to add the CMSIS Core includes path. + # All other CMSIS header files are included with their relative path + # in the CMSIS-NN micro kernel source files in + # tensorflow/lite/micro/kernels/cmsis-nn + INCLUDES += \ + -I$(CMSIS_PATH)/CMSIS/Core/Include + endif From be9727ea6d6d7c2883bb343154f69274887ee171 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Thu, 1 Oct 2020 12:14:59 -0700 Subject: [PATCH 06/54] set test_cortex_m_gcc_generic.sh to executable and removed unnecessary make clean CI checks should no longer fail after this change. --- .../lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh | 2 -- 1 file changed, 2 deletions(-) mode change 100644 => 100755 tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh diff --git a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh old mode 100644 new mode 100755 index 324ea0f7ab9..596c88965e7 --- a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh +++ b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh @@ -24,8 +24,6 @@ cd "${ROOT_DIR}" source tensorflow/lite/micro/tools/ci_build/helper_functions.sh -readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean - TARGET=cortex_m_gcc_generic # TODO(b/143715361): downloading first to allow for parallel builds. From 12833639d512dbb9ccb73089dbf35e7f497b0ffe Mon Sep 17 00:00:00 2001 From: Revan Sopher Date: Fri, 2 Oct 2020 11:02:37 -0700 Subject: [PATCH 07/54] Add sensible __repr__ implementation for TableConfig, FeatureConfig. 
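For example, after this change (expected output copied from the new unit
tests below):

    >>> table = tpu_embedding_v2_utils.TableConfig(
    ...     vocabulary_size=2, dim=4, initializer=None,
    ...     combiner='sum', name='table')
    >>> repr(table)
    "TableConfig(vocabulary_size=2, dim=4, initializer=None, optimizer=None, combiner='sum', name='table')"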
PiperOrigin-RevId: 335060379 Change-Id: I456fae3f07fd74b08db8f069ab817ac18f458fb7 --- .../python/tpu/tpu_embedding_v2_utils.py | 34 +++++++++++++++++++ .../python/tpu/tpu_embedding_v2_utils_test.py | 28 +++++++++++++++ 2 files changed, 62 insertions(+) diff --git a/tensorflow/python/tpu/tpu_embedding_v2_utils.py b/tensorflow/python/tpu/tpu_embedding_v2_utils.py index 8487581346b..e04f1f0281a 100644 --- a/tensorflow/python/tpu/tpu_embedding_v2_utils.py +++ b/tensorflow/python/tpu/tpu_embedding_v2_utils.py @@ -21,6 +21,7 @@ from __future__ import unicode_literals import abc import math +import typing from typing import Any, Dict, Callable, List, Optional, Text, Tuple, TypeVar, Union import six @@ -620,6 +621,29 @@ class TableConfig(object): self.combiner = combiner self.name = name + def __repr__(self): + # If using the default initializer, just print "None" for clarity. + initializer = self.initializer + + if isinstance(initializer, init_ops_v2.TruncatedNormal): + # PY2 type checking can't infer type of initializer even after if. + initializer = typing.cast(init_ops_v2.TruncatedNormal, initializer) + if (initializer.mean == 0.0 + and math.isclose(initializer.stddev, 1/math.sqrt(self.dim))): # pytype: disable=module-attr (math.isclose not in PY2) + initializer = None + + return ( + "TableConfig(vocabulary_size={vocabulary_size!r}, dim={dim!r}, " + "initializer={initializer!r}, optimizer={optimizer!r}, " + "combiner={combiner!r}, name={name!r})".format( + vocabulary_size=self.vocabulary_size, + dim=self.dim, + initializer=initializer, + optimizer=self.optimizer, + combiner=self.combiner, + name=self.name,) + ) + @tf_export("tpu.experimental.embedding.FeatureConfig") class FeatureConfig(object): @@ -697,3 +721,13 @@ class FeatureConfig(object): self.table = table self.max_sequence_length = max_sequence_length self.name = name + + def __repr__(self): + return ( + "FeatureConfig(table={table!r}, " + "max_sequence_length={max_sequence_length!r}, name={name!r})" + .format( + table=self.table, + max_sequence_length=self.max_sequence_length, + name=self.name) + ) diff --git a/tensorflow/python/tpu/tpu_embedding_v2_utils_test.py b/tensorflow/python/tpu/tpu_embedding_v2_utils_test.py index 14dfb32e075..48797b00009 100644 --- a/tensorflow/python/tpu/tpu_embedding_v2_utils_test.py +++ b/tensorflow/python/tpu/tpu_embedding_v2_utils_test.py @@ -60,6 +60,34 @@ class TPUEmbeddingOptimizerTest(parameterized.TestCase, test.TestCase): self.assertEqual(1., opt.clip_gradient_max) +class ConfigTest(test.TestCase): + + def test_table_config_repr(self): + table = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=2, dim=4, initializer=None, + combiner='sum', name='table') + + self.assertEqual( + repr(table), + 'TableConfig(vocabulary_size=2, dim=4, initializer=None, ' + 'optimizer=None, combiner=\'sum\', name=\'table\')') + + def test_feature_config_repr(self): + table = tpu_embedding_v2_utils.TableConfig( + vocabulary_size=2, dim=4, initializer=None, + combiner='sum', name='table') + + feature_config = tpu_embedding_v2_utils.FeatureConfig( + table=table, name='feature') + + self.assertEqual( + repr(feature_config), + 'FeatureConfig(table=TableConfig(vocabulary_size=2, dim=4, ' + 'initializer=None, optimizer=None, combiner=\'sum\', name=\'table\'), ' + 'max_sequence_length=0, name=\'feature\')' + ) + + if __name__ == '__main__': v2_compat.enable_v2_behavior() test.main() From 6015d64eab2bc9473e6d35b60d19d37ad8cce9ca Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Fri, 2 Oct 2020 11:03:12 -0700 Subject: [PATCH 
08/54] [XLA] Switch implementation of erf to use the same rational polynomial approximation as Eigen. PiperOrigin-RevId: 335060505 Change-Id: I58d091031c28ab5649553de58bd113846ba7e10c --- tensorflow/compiler/xla/client/lib/math.cc | 30 +++++++++++++++++----- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 410c86732d6..76cc6f0159b 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -203,7 +203,7 @@ static XlaOp ErfcImpl32(XlaOp x) { // Precondition: abs(x) <= 1. Otherwise, use ErfcImpl. // // This follows Cephes's f32 implementation of erf. -static XlaOp ErfImpl32(XlaOp x) { +static XlaOp ErfImpl32Cephes(XlaOp x) { // Coefficients for by erf(f32), from Cephes. // // erf(x) = x P(x^2), 0 < x < 1 @@ -291,11 +291,31 @@ XlaOp Erfc(XlaOp x) { // (not surprising!), so upcast to f32 in this case. return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl32(x), - ScalarLike(x, 1) - ErfImpl32(x)); + ScalarLike(x, 1) - ErfImpl32Cephes(x)); }); }); } +// Compute a polynomial approximation of the error function. +// This is the same approximation used by Eigen. +static XlaOp ErfImpl32(XlaOp x) { + static const std::array kAlpha{ + -2.72614225801306e-10f, 2.77068142495902e-08f, -2.10102402082508e-06f, + -5.69250639462346e-05f, -7.34990630326855e-04f, -2.95459980854025e-03f, + -1.60960333262415e-02f, + }; + + static const std::array kBeta{ + -1.45660718464996e-05f, -2.13374055278905e-04f, -1.68282697438203e-03f, + -7.37332916720468e-03f, -1.42647390514189e-02f, + }; + + x = Clamp(ScalarLike(x, -4.f), x, ScalarLike(x, 4.f)); + auto x2 = x * x; + return x * EvaluatePolynomial(x2, kAlpha) / + EvaluatePolynomial(x2, kBeta); +} + XlaOp Erf(XlaOp x) { auto& b = *x.builder(); return b.ReportErrorOrReturn([&]() -> StatusOr { @@ -310,10 +330,8 @@ XlaOp Erf(XlaOp x) { } // Erf(c)Impl don't have enough precision when run with bf16 intermediates // (not surprising!), so upcast to f32 in this case. - return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { - return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl32(x), - ScalarLike(x, 1) - ErfcImpl32(x)); - }); + return DoWithUpcastToF32(x, {BF16, F16}, + [](XlaOp x) { return ErfImpl32(x); }); }); } From 05316029214b567a8afaf875b9e2e4ce3e93dd27 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Fri, 2 Oct 2020 11:03:37 -0700 Subject: [PATCH 09/54] Add Ref/Unref to make sure rendezvous outlives its cancellation callback. Reenable test case on GPU. 
PiperOrigin-RevId: 335060618 Change-Id: I59730027765febbc086438401ba2002abe58423f --- .../core/common_runtime/rendezvous_mgr.cc | 4 +-- tensorflow/core/framework/local_rendezvous.cc | 29 +++++++++++++++++-- tensorflow/core/framework/local_rendezvous.h | 9 +++++- tensorflow/core/framework/rendezvous.cc | 2 +- tensorflow/python/eager/function_test.py | 3 +- 5 files changed, 39 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/common_runtime/rendezvous_mgr.cc b/tensorflow/core/common_runtime/rendezvous_mgr.cc index 4af624ae1d2..2ee74477231 100644 --- a/tensorflow/core/common_runtime/rendezvous_mgr.cc +++ b/tensorflow/core/common_runtime/rendezvous_mgr.cc @@ -151,7 +151,7 @@ void IntraProcessRecvAsyncImpl(const DeviceMgr* device_mgr, RefCountedIntraProcessRendezvous::RefCountedIntraProcessRendezvous( const DeviceMgr* device_mgr) - : device_mgr_(device_mgr) {} + : device_mgr_(device_mgr), local_(this) {} RefCountedIntraProcessRendezvous::~RefCountedIntraProcessRendezvous() {} @@ -176,7 +176,7 @@ void RefCountedIntraProcessRendezvous::StartAbort(const Status& s) { PrivateIntraProcessRendezvous::PrivateIntraProcessRendezvous( const DeviceMgr* device_mgr) - : device_mgr_(device_mgr) {} + : device_mgr_(device_mgr), local_(nullptr) {} PrivateIntraProcessRendezvous::~PrivateIntraProcessRendezvous() {} diff --git a/tensorflow/core/framework/local_rendezvous.cc b/tensorflow/core/framework/local_rendezvous.cc index 3535b57e7db..34053808b4a 100644 --- a/tensorflow/core/framework/local_rendezvous.cc +++ b/tensorflow/core/framework/local_rendezvous.cc @@ -187,6 +187,20 @@ void LocalRendezvous::RecvAsync(const Rendezvous::ParsedKey& key, CancellationToken token = CancellationManager::kInvalidToken; bool already_cancelled = false; if (cm != nullptr) { + // Increment the refcount when cancellation manager is present, to make + // sure the rendezvous outlives the recv and its cancel callbacks. + // This refcount is dropped in exactly one of the following cases: + // (1) Recv registers cancellation callback to cm, and then cm is + // cancelled, unref in the cancellation callback; + // (2) Recv registers cancellation callback to cm, but cm is already + // cancelled, unref in the already_cancelled check; + // (3) Recv is successful, and item done callback finishes deregistering + // the cancellation callback, unref in the item done callback; + // (4) Recv is successful, but the item done callback fails to deregister + // the cancellation callback because cm already StartCancel, in this + // case the cancellation callback will be invoked by the cm anyway, + // unref in the cancellation callback. + if (rc_owner_) rc_owner_->Ref(); token = cm->get_cancellation_token(); already_cancelled = !cm->RegisterCallback(token, [this, token, key_hash] { Item* item = nullptr; @@ -230,10 +244,14 @@ void LocalRendezvous::RecvAsync(const Rendezvous::ParsedKey& key, Rendezvous::Args(), item->args, Tensor(), /*is_dead=*/false); delete item; } + // Unref case (1) and (4) + if (rc_owner_) rc_owner_->Unref(); }); } if (already_cancelled) { mu_.unlock(); + // Unref case (2) + if (rc_owner_) rc_owner_->Unref(); done(StatusGroup::MakeDerived( errors::Cancelled("RecvAsync is cancelled.")), Rendezvous::Args(), recv_args, Tensor(), /*is_dead=*/false); @@ -250,10 +268,17 @@ void LocalRendezvous::RecvAsync(const Rendezvous::ParsedKey& key, // cancellation manager may no longer be live after `done` is called. 
queue->push_back(new Item( recv_args, - [cm, token, done = std::move(done)]( + [this, cm, token, done = std::move(done)]( const Status& s, const Rendezvous::Args& send_args, const Rendezvous::Args& recv_args, const Tensor& v, bool dead) { - cm->TryDeregisterCallback(token); + // TryDeregisterCallback returns true when the cancellation callback + // is successfully deregistered. If it fails because the CM already + // StartAbort, Unref will happen inside the cancellation callback + // when called by the CM. + if (cm->TryDeregisterCallback(token)) { + // Unref case (3) + if (this->rc_owner_) this->rc_owner_->Unref(); + } done(s, send_args, recv_args, v, dead); }, token)); diff --git a/tensorflow/core/framework/local_rendezvous.h b/tensorflow/core/framework/local_rendezvous.h index 19c218793b6..ed3a5d4dd73 100644 --- a/tensorflow/core/framework/local_rendezvous.h +++ b/tensorflow/core/framework/local_rendezvous.h @@ -35,7 +35,11 @@ namespace tensorflow { // is not expected to be needed. class LocalRendezvous { public: - LocalRendezvous() = default; + // If the class wrapping LocalRendezvous is refcounted (i.e., extending + // Rendezvous), pass in its pointer in constructor so the LocalRendezvous + // can make sure it outlives the async recv requests. + // Pass in nullptr if the wrapping class is not refcounted. + explicit LocalRendezvous(Rendezvous* owner) : rc_owner_(owner) {} ~LocalRendezvous(); Status Send(const Rendezvous::ParsedKey& key, @@ -62,6 +66,9 @@ class LocalRendezvous { typedef gtl::FlatMap Table; + // Pointer to the owner class of this LocalRendezvous if it is refcounted. + const Rendezvous* rc_owner_; + // TODO(zhifengc): shard table_. mutex mu_; Table table_ TF_GUARDED_BY(mu_); diff --git a/tensorflow/core/framework/rendezvous.cc b/tensorflow/core/framework/rendezvous.cc index 764f8995d02..9d63265af62 100644 --- a/tensorflow/core/framework/rendezvous.cc +++ b/tensorflow/core/framework/rendezvous.cc @@ -151,7 +151,7 @@ Status RendezvousInterface::Recv(const ParsedKey& key, const Args& args, namespace { class LocalRendezvousWrapper : public Rendezvous { public: - LocalRendezvousWrapper() = default; + LocalRendezvousWrapper() : impl_(this) {} Status Send(const ParsedKey& key, const Args& send_args, const Tensor& val, const bool is_dead) override { diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py index f825d6db971..69433bc9ce3 100644 --- a/tensorflow/python/eager/function_test.py +++ b/tensorflow/python/eager/function_test.py @@ -3364,8 +3364,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase): with self.assertRaises(errors.CancelledError): cancelable_func() - # TODO(b/162544929): Enable this test. - def DISABLE_testCancelBlockedFunctionExecution(self): + def testCancelBlockedFunctionExecution(self): if not context.executing_eagerly(): self.skipTest('eager only') From 6597cff0a3ee53a99af17d60c7f0cfe2bdb141bc Mon Sep 17 00:00:00 2001 From: Chuanhao Zhuge Date: Fri, 2 Oct 2020 11:10:13 -0700 Subject: [PATCH 10/54] Support more DTypes in c_api_tfrt. 
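[Editor's note on the Ref/Unref scheme in PATCH 09 above: the invariant is that RecvAsync takes exactly one extra reference when a cancellation manager is present, and exactly one of the four commented paths releases it. A schematic single-threaded Python model of that invariant; all names are invented, nothing here is TF API:]

    class CancellationManager:
        """Toy stand-in for TF's CancellationManager."""

        def __init__(self):
            self.cancelled = False
            self.callbacks = []

        def register(self, cb):
            if self.cancelled:
                return False          # mirrors RegisterCallback returning false
            self.callbacks.append(cb)
            return True

        def deregister(self, cb):
            if cb in self.callbacks:
                self.callbacks.remove(cb)
                return True           # callback will not run; caller must unref
            return False              # cancel in flight; the callback will unref

        def start_cancel(self):
            self.cancelled = True
            for cb in self.callbacks:
                cb()
            self.callbacks = []

    class Rendezvous:
        """Toy refcount; stands in for the rc_owner_ Ref/Unref pairs."""

        def __init__(self):
            self.refs = 1

        def ref(self):
            self.refs += 1

        def unref(self):
            self.refs -= 1
            if self.refs == 0:
                print("rendezvous destroyed")

    def recv_async(rdv, cm, done):
        rdv.ref()                     # taken once, released on exactly one path

        def on_cancel():
            rdv.unref()               # cases (1) and (4)
            done("cancelled")

        if not cm.register(on_cancel):
            rdv.unref()               # case (2): manager already cancelled
            done("cancelled")
            return None

        def on_recv_done(value):
            if cm.deregister(on_cancel):
                rdv.unref()           # case (3): normal completion
            done(value)

        return on_recv_done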
PiperOrigin-RevId: 335062120
Change-Id: Iae99c18d70f98c608c382a3b78a9185f2be2463f
---
 tensorflow/python/eager/tensor_test.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index 7539b111f19..f0226435a72 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -348,7 +348,6 @@ class TFETensorTest(test_util.TensorFlowTestCase):
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   @test_util.run_in_graph_and_eager_modes
-  @test_util.disable_tfrt("b/169372865: support CreateTensor for half.")
   def testConvertToTensorNumpyZeroDim(self):
     for np_type, dtype in [(np.int32, dtypes.int32),
                            (np.half, dtypes.half),
@@ -545,7 +544,6 @@ class TFETensorUtilTest(test_util.TensorFlowTestCase):
       constant_op.constant(l)
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
-  @test_util.disable_tfrt("b/169372865: support CreateTensor for complex128.")
   def testFloatAndIntAreConvertibleToComplex(self):
     a = [[1., 1], [1j, 2j]]
     np_value = np.array(a, dtype=np.complex128)

From 02d75f241baa1e558419ea42aa4da6104b3e550b Mon Sep 17 00:00:00 2001
From: Cesar Crusius
Date: Fri, 2 Oct 2020 11:11:13 -0700
Subject: [PATCH 11/54] Fixes is_initialized() behavior under custom devices.

This is a "safe low blast radius" fix for a particular instance of a device
placement shortcoming: an explicit device placement is never overridden by a
custom device. This change fixes it for calls to is_initialized() while a
complete solution is not available.

PiperOrigin-RevId: 335062303
Change-Id: Iab5b32d371d1217690fdcb656598f98005d6c232
---
 tensorflow/python/ops/resource_variable_ops.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 4806617c1af..548eb30df97 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -800,7 +800,13 @@ class BaseResourceVariable(variables.VariableV1, core.Tensor):
     Returns:
       A `Tensor` of type `bool`.
     """
-    return gen_resource_variable_ops.var_is_initialized_op(self.handle, name)
+    # TODO(b/169792703): The current device placement logic never overrides an
+    # explicit placement with a custom device, causing `v.is_initialized()` to
+    # fail under a non-custom device context if `v` is in a custom device. The
+    # explicit placement below makes this work, but should not be necessary once
+    # the logic is updated to handle cases like this.
+    with ops.device(self.device):
+      return gen_resource_variable_ops.var_is_initialized_op(self.handle, name)
 
   def assign_sub(self, delta, use_locking=None, name=None, read_value=True):
     """Subtracts a value from this variable.
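[Editor's note on PATCH 11 above: the general pattern is to pin an op to the device where the resource actually lives instead of trusting ambient placement. A minimal public-API illustration, assuming an eager TF 2.x runtime; is_initialized() is the method patched above:]

    import tensorflow as tf

    v = tf.Variable(1.0)

    # Pinning the check to the variable's own device keeps it working even
    # when an enclosing device scope (or a custom device) would otherwise
    # place the op somewhere the resource does not live.
    with tf.device(v.device):
        print(v.is_initialized())  # tf.Tensor(True, shape=(), dtype=bool)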
From 91a10946704e9ae670375c079c1e9bb09ab3c58c Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Fri, 2 Oct 2020 11:37:22 -0700 Subject: [PATCH 12/54] Remove x86_64 builds from legacy TF Android build scripts This build script is effectively deprecated, and the x86_64 backend is causing issues with lengthy build times PiperOrigin-RevId: 335067884 Change-Id: I5d9a0f396fe97ecfab3d03a1a28b26bab90686c4 --- tensorflow/tools/ci_build/builds/android_full.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/ci_build/builds/android_full.sh b/tensorflow/tools/ci_build/builds/android_full.sh index 8e555a3c71c..165683de63a 100755 --- a/tensorflow/tools/ci_build/builds/android_full.sh +++ b/tensorflow/tools/ci_build/builds/android_full.sh @@ -28,7 +28,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/builds_common.sh" configure_android_workspace -CPUS=armeabi-v7a,arm64-v8a,x86,x86_64 +CPUS=armeabi-v7a,arm64-v8a,x86 OUT_DIR="$(pwd)/out/" AAR_LIB_TMP="$(pwd)/aar_libs" From eed3ab97e5cba16ef36e9ba4f9a67e5f19244e23 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Fri, 2 Oct 2020 11:52:49 -0700 Subject: [PATCH 13/54] Internal refactoring, noop in oss. PiperOrigin-RevId: 335070962 Change-Id: I3dc0767cc10e44833471bbbce21b33dd1be16530 --- tensorflow/cc/saved_model/BUILD | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 6c8b92790c6..056c99eed8e 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -42,13 +42,15 @@ cc_library( name = "reader", srcs = ["reader.cc"], hdrs = ["reader.h"], - deps = [":constants"] + if_not_mobile([ + deps = [ + ":constants", + "//tensorflow/core:protos_all_cc", + ] + if_not_mobile([ # TODO(b/111634734): :lib and :protos_all contain dependencies that # cannot be built on mobile platforms. Instead, include the appropriate # tf_lib depending on the build platform. "@com_google_absl//absl/memory:memory", "//tensorflow/core:lib", - "//tensorflow/core:protos_all_cc", ]), ) From 27d26a8d86bceda282ad9ba3e3116a00759d4ebc Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 2 Oct 2020 11:54:14 -0700 Subject: [PATCH 14/54] Roll forward of https://github.com/tensorflow/tensorflow/pull/42970 with fix for 5D tensors and unsupported axis that cannot use fused batch norm. 
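[Editor's note: the kernel-side trick running through the diffs below is to fold two spatial dimensions together so a 5D batch-norm input can reuse the existing 4D implementation; per-channel statistics are unaffected because the same set of non-channel elements is reduced either way. A quick NumPy check of that equivalence, with arbitrary shapes:]

    import numpy as np

    x = np.random.randn(2, 3, 4, 5, 6).astype(np.float32)  # NDHWC
    # Fold H and W into one axis, as the ShapeFromFormat(...,
    # {planes, rows * cols}, ...) call does in the C++ below.
    x4d = x.reshape(2, 3, 4 * 5, 6)

    # Reductions over all non-channel axes are identical in both layouts.
    assert np.allclose(x.mean(axis=(0, 1, 2, 3)), x4d.mean(axis=(0, 1, 2)))
    assert np.allclose(x.var(axis=(0, 1, 2, 3)), x4d.var(axis=(0, 1, 2)))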
PiperOrigin-RevId: 335071264 Change-Id: Id9ab44bdba870336f03522e1ca76d88b1f305a10 --- tensorflow/core/framework/common_shape_fns.cc | 40 ++++---- .../generic_layout_optimizer_transposer.cc | 50 +++++++++- .../core/grappler/optimizers/remapper.cc | 38 +++++--- .../core/kernels/fused_batch_norm_op.cc | 73 ++++++++++++--- tensorflow/core/ops/nn_ops.cc | 4 +- .../python/grappler/layout_optimizer_test.py | 88 ++++++++++++++++++ .../python/keras/layers/normalization.py | 23 +++-- .../python/keras/layers/normalization_test.py | 11 ++- .../python/ops/nn_fused_batchnorm_test.py | 93 +++++++++++++++---- tensorflow/python/ops/nn_grad.py | 35 +++++-- tensorflow/python/ops/nn_impl.py | 7 +- 11 files changed, 379 insertions(+), 83 deletions(-) diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index c54418ba648..2d30d41c7a6 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -1121,8 +1121,17 @@ Status AvgPoolShape(shape_inference::InferenceContext* c) { } Status FusedBatchNormShape(shape_inference::InferenceContext* c) { + string data_format_str; + TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); + TensorFormat data_format; + if (!FormatFromString(data_format_str, &data_format)) { + return errors::InvalidArgument("Invalid data format string: ", + data_format_str); + } + const int rank = + (data_format_str == "NDHWC" or data_format_str == "NCDHW") ? 5 : 4; ShapeHandle x; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &x)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &x)); bool is_training; TF_RETURN_IF_ERROR(c->GetAttr("is_training", &is_training)); @@ -1131,14 +1140,8 @@ Status FusedBatchNormShape(shape_inference::InferenceContext* c) { exponential_avg_factor = 1.0f; // default value } int number_inputs = (is_training && exponential_avg_factor == 1.0f) ? 3 : 5; - string data_format_str; - TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); - TensorFormat data_format; - if (!FormatFromString(data_format_str, &data_format)) { - return errors::InvalidArgument("Invalid data format string: ", - data_format_str); - } - int channel_dim_index = GetTensorFeatureDimIndex(4, data_format); + + int channel_dim_index = GetTensorFeatureDimIndex(rank, data_format); DimensionHandle channel_dim = c->Dim(x, channel_dim_index); // covers scale, offset, and if is_training is false, mean, variance @@ -1191,13 +1194,6 @@ Status FusedBatchNormExShape(shape_inference::InferenceContext* c) { } Status FusedBatchNormGradShape(shape_inference::InferenceContext* c) { - ShapeHandle y_backprop; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 4, &y_backprop)); - ShapeHandle x; - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 4, &x)); - - bool is_training; - TF_RETURN_IF_ERROR(c->GetAttr("is_training", &is_training)); string data_format_str; TF_RETURN_IF_ERROR(c->GetAttr("data_format", &data_format_str)); TensorFormat data_format; @@ -1205,7 +1201,17 @@ Status FusedBatchNormGradShape(shape_inference::InferenceContext* c) { return errors::InvalidArgument("Invalid data format string: ", data_format_str); } - int channel_dim_index = GetTensorFeatureDimIndex(4, data_format); + const int rank = + (data_format_str == "NDHWC" or data_format_str == "NCDHW") ? 
5 : 4; + ShapeHandle y_backprop; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), rank, &y_backprop)); + ShapeHandle x; + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), rank, &x)); + + bool is_training; + TF_RETURN_IF_ERROR(c->GetAttr("is_training", &is_training)); + + int channel_dim_index = GetTensorFeatureDimIndex(rank, data_format); DimensionHandle channel_dim = c->Dim(y_backprop, channel_dim_index); TF_RETURN_IF_ERROR( c->Merge(channel_dim, c->Dim(x, channel_dim_index), &channel_dim)); diff --git a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc index 10253f187c0..3c466edc69b 100644 --- a/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc +++ b/tensorflow/core/grappler/optimizers/generic_layout_optimizer_transposer.cc @@ -670,7 +670,25 @@ Status LayoutSensitiveOpTransposer::UpdateNode(TransposeContext* context, Status DefaultLayoutSensitiveOpTransposer::TransposeNode( TransposeContext* context, utils::MutableNodeView* node) { DCHECK(IsDefaultLayoutSensitiveOp(*node->node())); - if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4)) { + const auto* output_shape_attr = node->GetAttr(kAttrOutputShape); + const auto& shape = output_shape_attr->list().shape(0); + const int rank = shape.dim_size(); + std::string src_format = context->src_format; + std::string dst_format = context->dst_format; + // Update the format from 4D to 5D layout if necessary. + bool allow_5d = rank == 5 && (src_format == "NHWC" || src_format == "NCHW"); + if (allow_5d) { + std::string src_format_3d = src_format == "NHWC" ? "NDHWC" : "NCDHW"; + std::string dst_format_3d = dst_format == "NHWC" ? "NDHWC" : "NCDHW"; + context->AssignDeviceAndDataFormats(context->target_device, src_format_3d, + dst_format_3d); + } + if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, rank)) { + // Change back to the original layout due to early exit. + if (allow_5d) { + context->AssignDeviceAndDataFormats(context->target_device, src_format, + dst_format); + } return Status::OK(); } VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() @@ -679,6 +697,11 @@ Status DefaultLayoutSensitiveOpTransposer::TransposeNode( TF_RETURN_IF_ERROR(UpdateNode(context, node)); TF_RETURN_IF_ERROR(UpdateFaninEdgesWithOp(context, {0}, node, kOpTranspose)); TF_RETURN_IF_ERROR(UpdateFanoutEdgesWithOp(context, {0}, node, kOpTranspose)); + // Change back the format from 5D to 4D layout. + if (allow_5d) { + context->AssignDeviceAndDataFormats(context->target_device, src_format, + dst_format); + } return context->graph_view->GetMutationBuilder()->Apply(); } @@ -881,8 +904,26 @@ bool FusedBatchNormGradTransposer::IsTraining( Status FusedBatchNormGradTransposer::TransposeNode( TransposeContext* context, utils::MutableNodeView* node) { DCHECK(IsFusedBatchNormGrad(*node->node())); - if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, 4) || + const auto* output_shape_attr = node->GetAttr(kAttrOutputShape); + const auto& shape = output_shape_attr->list().shape(0); + const int rank = shape.dim_size(); + std::string src_format = context->src_format; + std::string dst_format = context->dst_format; + // Update the format from 4D to 5D layout if necessary. + bool allow_5d = rank == 5 && (src_format == "NHWC" || src_format == "NCHW"); + if (allow_5d) { + std::string src_format_3d = src_format == "NHWC" ? "NDHWC" : "NCDHW"; + std::string dst_format_3d = dst_format == "NHWC" ? 
"NDHWC" : "NCDHW"; + context->AssignDeviceAndDataFormats(context->target_device, src_format_3d, + dst_format_3d); + } + if (!ShouldProcess(*context, *node) || !IsFanoutPortRankN(*node, 0, rank) || !IsTraining(*node)) { + // Change back to the original layout due to early exit. + if (allow_5d) { + context->AssignDeviceAndDataFormats(context->target_device, src_format, + dst_format); + } return Status::OK(); } VLOG(3) << "GenericLayoutOptimizer: transforming node '" << node->GetName() @@ -892,6 +933,11 @@ Status FusedBatchNormGradTransposer::TransposeNode( TF_RETURN_IF_ERROR( UpdateFaninEdgesWithOp(context, {0, 1}, node, kOpTranspose)); TF_RETURN_IF_ERROR(UpdateFanoutEdgesWithOp(context, {0}, node, kOpTranspose)); + // Change back the format from 5D to 4D layout. + if (allow_5d) { + context->AssignDeviceAndDataFormats(context->target_device, src_format, + dst_format); + } return context->graph_view->GetMutationBuilder()->Apply(); } diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc index 115428ff5ef..db528da2f6d 100644 --- a/tensorflow/core/grappler/optimizers/remapper.cc +++ b/tensorflow/core/grappler/optimizers/remapper.cc @@ -1438,29 +1438,41 @@ Status AddBatchNormNodes(RemapperContext* ctx, const FusedBatchNorm& matched) { utils::Mutation* mutation = ctx->graph_view.GetMutationBuilder(); Status status; - if (fused_node.attr().at(kDataFormat).s() == "NCHW") { + string x_format = fused_node.attr().at(kDataFormat).s(); + if (x_format == "NCHW" or x_format == "NCDHW") { // Need to reshape the last 4 inputs NodeDef new_shape; const string new_shape_name = - AddPrefixToNodeName("NCHWShape", fused_node.name()); + AddPrefixToNodeName(x_format + "Shape", fused_node.name()); new_shape.set_name(new_shape_name); new_shape.set_op("Const"); new_shape.set_device(fused_node.device()); *new_shape.add_input() = AsControlDependency(scale); (*new_shape.mutable_attr())["dtype"].set_type(DT_INT32); - Tensor t(DT_INT32, {4}); - t.flat()(0) = 1; - t.flat()(1) = -1; - t.flat()(2) = 1; - t.flat()(3) = 1; - t.AsProtoTensorContent( - (*new_shape.mutable_attr())["value"].mutable_tensor()); + if (x_format == "NCHW") { + Tensor t(DT_INT32, {4}); + t.flat()(0) = 1; + t.flat()(1) = -1; + t.flat()(2) = 1; + t.flat()(3) = 1; + t.AsProtoTensorContent( + (*new_shape.mutable_attr())["value"].mutable_tensor()); + } else { + Tensor t(DT_INT32, {5}); + t.flat()(0) = 1; + t.flat()(1) = -1; + t.flat()(2) = 1; + t.flat()(3) = 1; + t.flat()(4) = 1; + t.AsProtoTensorContent( + (*new_shape.mutable_attr())["value"].mutable_tensor()); + } mutation->AddNode(std::move(new_shape), &status); TF_RETURN_IF_ERROR(status); NodeDef reshaped_scale; reshaped_scale.set_name( - AddPrefixToNodeName("NCHWShapedScale", fused_node.name())); + AddPrefixToNodeName(x_format + "ShapedScale", fused_node.name())); reshaped_scale.set_op("Reshape"); reshaped_scale.set_device(fused_node.device()); *reshaped_scale.add_input() = scale; @@ -1473,7 +1485,7 @@ Status AddBatchNormNodes(RemapperContext* ctx, const FusedBatchNorm& matched) { NodeDef reshaped_offset; reshaped_offset.set_name( - AddPrefixToNodeName("NCHWShapedOffset", fused_node.name())); + AddPrefixToNodeName(x_format + "ShapedOffset", fused_node.name())); reshaped_offset.set_op("Reshape"); reshaped_offset.set_device(fused_node.device()); *reshaped_offset.add_input() = offset; @@ -1486,7 +1498,7 @@ Status AddBatchNormNodes(RemapperContext* ctx, const FusedBatchNorm& matched) { NodeDef reshaped_mean; reshaped_mean.set_name( - 
AddPrefixToNodeName("NCHWShapedMean", fused_node.name())); + AddPrefixToNodeName(x_format + "ShapedMean", fused_node.name())); reshaped_mean.set_op("Reshape"); reshaped_mean.set_device(fused_node.device()); *reshaped_mean.add_input() = mean; @@ -1499,7 +1511,7 @@ Status AddBatchNormNodes(RemapperContext* ctx, const FusedBatchNorm& matched) { NodeDef reshaped_variance; reshaped_variance.set_name( - AddPrefixToNodeName("NCHWShapedVariance", fused_node.name())); + AddPrefixToNodeName(x_format + "ShapedVariance", fused_node.name())); reshaped_variance.set_op("Reshape"); reshaped_variance.set_device(fused_node.device()); *reshaped_variance.add_input() = variance; diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc index 00ac9be6dcd..d8e58093b07 100644 --- a/tensorflow/core/kernels/fused_batch_norm_op.cc +++ b/tensorflow/core/kernels/fused_batch_norm_op.cc @@ -1241,15 +1241,15 @@ class FusedBatchNormOpBase : public OpKernel { // If use_reserved_space is false, we don't have 5th output. virtual void ComputeWithReservedSpace(OpKernelContext* context, bool use_reserved_space) { - const Tensor& x = context->input(0); + Tensor x = context->input(0); const Tensor& scale = context->input(1); const Tensor& offset = context->input(2); const Tensor& estimated_mean = context->input(3); const Tensor& estimated_variance = context->input(4); const Tensor* side_input = has_side_input_ ? &context->input(5) : nullptr; - OP_REQUIRES(context, x.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", + OP_REQUIRES(context, x.dims() == 4 or x.dims() == 5, + errors::InvalidArgument("input must be 4 or 5-dimensional", x.shape().DebugString())); OP_REQUIRES(context, scale.dims() == 1, errors::InvalidArgument("scale must be 1-dimensional", @@ -1264,6 +1264,21 @@ class FusedBatchNormOpBase : public OpKernel { context, estimated_variance.dims() == 1, errors::InvalidArgument("estimated_variance must be 1-dimensional", estimated_variance.shape().DebugString())); + bool use_reshape = (x.dims() == 5); + auto x_shape = x.shape(); + TensorShape dest_shape; + if (use_reshape) { + const int64 in_batch = GetTensorDim(x, tensor_format_, 'N'); + int64 in_planes = GetTensorDim(x, tensor_format_, '0'); + int64 in_rows = GetTensorDim(x, tensor_format_, '1'); + int64 in_cols = GetTensorDim(x, tensor_format_, '2'); + const int64 in_depth = GetTensorDim(x, tensor_format_, 'C'); + dest_shape = ShapeFromFormat(tensor_format_, in_batch, + {{in_planes, in_rows * in_cols}}, in_depth); + OP_REQUIRES(context, x.CopyFrom(x, dest_shape), + errors::InvalidArgument("Error during tensor copy.")); + } + if (has_side_input_) { OP_REQUIRES(context, side_input->shape() == x.shape(), errors::InvalidArgument( @@ -1282,8 +1297,10 @@ class FusedBatchNormOpBase : public OpKernel { } Tensor* y = nullptr; + auto alloc_shape = use_reshape ? 
dest_shape : x_shape; OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( - {0}, 0, x.shape(), &y)); + {0}, 0, alloc_shape, &y)); + Tensor* batch_mean = nullptr; OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( {3}, 1, scale.shape(), &batch_mean)); @@ -1310,6 +1327,10 @@ class FusedBatchNormOpBase : public OpKernel { batch_mean, batch_var, saved_mean, saved_maybe_inv_var, tensor_format_, use_reserved_space); } + if (use_reshape) { + OP_REQUIRES(context, y->CopyFrom(*y, x_shape), + errors::InvalidArgument("Error during tensor copy.")); + } } private: @@ -1375,8 +1396,8 @@ class FusedBatchNormGradOpBase : public OpKernel { virtual void ComputeWithReservedSpace(OpKernelContext* context, bool use_reserved_space) { - const Tensor& y_backprop = context->input(0); - const Tensor& x = context->input(1); + Tensor y_backprop = context->input(0); + Tensor x = context->input(1); const Tensor& scale = context->input(2); // When is_training=True, batch mean and variance/inverted variance are // saved in the forward pass to be reused here. When is_training=False, @@ -1387,11 +1408,11 @@ class FusedBatchNormGradOpBase : public OpKernel { // saves inverted variance. const Tensor& saved_maybe_inv_var_or_pop_var = context->input(4); - OP_REQUIRES(context, y_backprop.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", + OP_REQUIRES(context, y_backprop.dims() == 4 or y_backprop.dims() == 5, + errors::InvalidArgument("input must be 4 or 5-dimensional", y_backprop.shape().DebugString())); - OP_REQUIRES(context, x.dims() == 4, - errors::InvalidArgument("input must be 4-dimensional", + OP_REQUIRES(context, x.dims() == 4 or x.dims() == 5, + errors::InvalidArgument("input must be 4 or 5-dimensional", x.shape().DebugString())); OP_REQUIRES(context, scale.dims() == 1, errors::InvalidArgument("scale must be 1-dimensional", @@ -1404,10 +1425,27 @@ class FusedBatchNormGradOpBase : public OpKernel { errors::InvalidArgument( "saved variance must be 1-dimensional", saved_maybe_inv_var_or_pop_var.shape().DebugString())); + bool use_reshape = (x.dims() == 5); + auto x_shape = x.shape(); + TensorShape dest_shape; + if (use_reshape) { + const int64 in_batch = GetTensorDim(x, tensor_format_, 'N'); + int64 in_planes = GetTensorDim(x, tensor_format_, '0'); + int64 in_rows = GetTensorDim(x, tensor_format_, '1'); + int64 in_cols = GetTensorDim(x, tensor_format_, '2'); + const int64 in_depth = GetTensorDim(x, tensor_format_, 'C'); + dest_shape = ShapeFromFormat(tensor_format_, in_batch, + {{in_planes, in_rows * in_cols}}, in_depth); + OP_REQUIRES(context, x.CopyFrom(x, dest_shape), + errors::InvalidArgument("Error during tensor copy.")); + OP_REQUIRES(context, y_backprop.CopyFrom(y_backprop, dest_shape), + errors::InvalidArgument("Error during tensor copy.")); + } Tensor* x_backprop = nullptr; + auto alloc_shape = use_reshape ? dest_shape : x_shape; OP_REQUIRES_OK(context, - context->allocate_output(0, x.shape(), &x_backprop)); + context->allocate_output(0, alloc_shape, &x_backprop)); const TensorShape& scale_offset_shape = scale.shape(); Tensor* scale_backprop = nullptr; @@ -1441,15 +1479,20 @@ class FusedBatchNormGradOpBase : public OpKernel { offset_backprop, use_reserved_space, tensor_format_); } else { // Necessary layout conversion is currently done in python. 
- CHECK(tensor_format_ == FORMAT_NHWC) - << "The implementation of FusedBatchNormGrad with is_training=False " - "only support " - << "NHWC tensor format for now."; + OP_REQUIRES(context, tensor_format_ == FORMAT_NHWC, + errors::InvalidArgument( + "The implementation of " + "FusedBatchNormGrad with is_training=False only support " + "NHWC tensor format for now.")); functor::FusedBatchNormFreezeGrad()( context, y_backprop, x, scale, saved_mean_or_pop_mean, saved_maybe_inv_var_or_pop_var, epsilon_, x_backprop, scale_backprop, offset_backprop); } + if (use_reshape) { + OP_REQUIRES(context, x_backprop->CopyFrom(*x_backprop, x_shape), + errors::InvalidArgument("Error during tensor copy.")); + } } private: diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 2b6330db4aa..759bf0f0ddf 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -221,7 +221,7 @@ REGISTER_OP("FusedBatchNormV3") .Attr("U: {float}") .Attr("epsilon: float = 0.0001") .Attr("exponential_avg_factor: float = 1.0") - .Attr(GetConvnetDataFormatAttrString()) + .Attr(GetConvnetDataFormat2D3DAttrString()) .Attr("is_training: bool = true") .SetShapeFn(shape_inference::FusedBatchNormV3Shape); @@ -308,7 +308,7 @@ REGISTER_OP("FusedBatchNormGradV3") .Attr("T: {half, bfloat16, float}") .Attr("U: {float}") .Attr("epsilon: float = 0.0001") - .Attr(GetConvnetDataFormatAttrString()) + .Attr(GetConvnetDataFormat2D3DAttrString()) .Attr("is_training: bool = true") .SetShapeFn(shape_inference::FusedBatchNormGradShape); // -------------------------------------------------------------------------- diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py index c80ab536588..263b05047da 100644 --- a/tensorflow/python/grappler/layout_optimizer_test.py +++ b/tensorflow/python/grappler/layout_optimizer_test.py @@ -1275,6 +1275,94 @@ class LayoutOptimizerTest(test.TestCase): self._assert_trans_ndhwc_to_ncdhw('batchnorm/mul_1-1', nodes) self._assert_trans_ndhwc_to_ncdhw('batchnorm/add_1-1', nodes) self._assert_trans_ncdhw_to_ndhwc('batchnorm/add_1-0-0', nodes) + + @test_util.deprecated_graph_mode_only + def testBatchNorm3D(self): + if test.is_gpu_available(cuda_only=True): + random_seed.set_random_seed(0) + x_3d = random_ops.truncated_normal([1, 4, 2, 3, 3], seed=0) + filters = random_ops.truncated_normal([2, 2, 2, 3, 3], seed=0) + strides_val = [1, 1, 1, 1, 1] + scale = constant_op.constant(0.1, shape=[3]) + offset = constant_op.constant(0.3, shape=[3]) + conv3d = gen_nn_ops.conv3d(x_3d, filters, strides_val, 'SAME') + y, _, _ = nn.fused_batch_norm(conv3d, scale, offset, data_format='NDHWC') + output = array_ops.identity(y) + + with session.Session(config=_get_config(False)) as sess: + output_val_ref = sess.run(output) + + with session.Session(config=_get_config()) as sess: + metadata = config_pb2.RunMetadata() + output_val = sess.run(output, run_metadata=metadata) + + nodes = [] + num_transposes = 0 + for node in metadata.cost_graph.node: + if _is_transpose(node.name): + num_transposes += 1 + nodes.append(node.name) + + expected_num_transposes = 2 + self.assertEqual(expected_num_transposes, num_transposes) + self._assert_trans_ndhwc_to_ncdhw('Conv3D-0', nodes) + self._assert_trans_ncdhw_to_ndhwc('FusedBatchNormV3-0-0', nodes) + self.assertAllClose(output_val_ref, output_val, atol=1e-3) + + @test_util.deprecated_graph_mode_only + def testBatchNormGrad3D(self): + if test.is_gpu_available(cuda_only=True): + random_seed.set_random_seed(0) + 
x_3d = random_ops.truncated_normal([1, 4, 2, 3, 3], seed=0) + filters = random_ops.truncated_normal([2, 2, 2, 3, 3], seed=0) + strides_val = [1, 1, 1, 1, 1] + scale = constant_op.constant(0.1, shape=[3]) + offset = constant_op.constant(0.3, shape=[3]) + mean = constant_op.constant(0.1, shape=[3]) + variance = constant_op.constant(0.3, shape=[3]) + conv3d = gen_nn_ops.conv3d(x_3d, filters, strides_val, 'SAME') + y, running_mean, running_var, r0, r1, r2 = gen_nn_ops.fused_batch_norm_v3( + conv3d, + scale, + offset, + mean, + variance, + epsilon=1.001e-5, + exponential_avg_factor=1.0, + data_format='NDHWC', + is_training=True, + name='batch_norm') + dx, dscale, doffset, _, _ = gen_nn_ops.fused_batch_norm_grad_v3( + y, + x_3d, + scale, + r0, + r1, + r2, + epsilon=1.001e-5, + data_format='NDHWC', + is_training=True) + output = array_ops.identity(dx) + + with session.Session(config=_get_config(False)) as sess: + output_val_ref = sess.run(output) + + with session.Session(config=_get_config()) as sess: + metadata = config_pb2.RunMetadata() + output_val = sess.run(output, run_metadata=metadata) + + nodes = [] + num_transposes = 0 + for node in metadata.cost_graph.node: + if _is_transpose(node.name): + num_transposes += 1 + nodes.append(node.name) + + expected_num_transposes = 3 + self.assertEqual(expected_num_transposes, num_transposes) + self._assert_trans_ndhwc_to_ncdhw('Conv3D-0', nodes) + self._assert_trans_ndhwc_to_ncdhw('FusedBatchNormGradV3-1', nodes) + self._assert_trans_ncdhw_to_ndhwc('FusedBatchNormGradV3-0-0', nodes) self.assertAllClose(output_val_ref, output_val, atol=1e-3) @test_util.deprecated_graph_mode_only diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py index dc6eda6dcc3..2809cbb0108 100644 --- a/tensorflow/python/keras/layers/normalization.py +++ b/tensorflow/python/keras/layers/normalization.py @@ -330,13 +330,13 @@ class BatchNormalizationBase(Layer): # output back to its original shape accordingly. if self._USE_V2_BEHAVIOR: if self.fused is None: - self.fused = (ndims == 4) - elif self.fused and ndims != 4: + self.fused = ndims in (4, 5) + elif self.fused and ndims not in (4, 5): raise ValueError('Batch normalization layers with fused=True only ' - 'support 4D input tensors.') + 'support 4D or 5D input tensors.') else: assert self.fused is not None - self.fused = (ndims == 4 and self._fused_can_be_used()) + self.fused = (ndims in (4, 5) and self._fused_can_be_used()) # TODO(chrisying): fused batch norm is currently not supported for # multi-axis batch norm and by extension virtual batches. In some cases, # it might be possible to use fused batch norm but would require reshaping @@ -345,13 +345,22 @@ class BatchNormalizationBase(Layer): # common use case (turning 5D w/ virtual batch to NCHW) if self.fused: - if self.axis == [1]: + if self.axis == [1] and ndims == 4: self._data_format = 'NCHW' - elif self.axis == [3]: + elif self.axis == [1] and ndims == 5: + self._data_format = 'NCDHW' + elif self.axis == [3] and ndims == 4: self._data_format = 'NHWC' + elif self.axis == [4] and ndims == 5: + self._data_format = 'NDHWC' + elif ndims == 5: + # 5D tensors that can be passed in but should not use fused batch norm + # due to unsupported axis. 
+        self.fused = False
       else:
         raise ValueError('Unsupported axis, fused batch norm only supports '
-                         'axis == [1] or axis == [3]')
+                         'axis == [1] or axis == [3] for 4D input tensors or '
+                         'axis == [1] or axis == [4] for 5D input tensors')
 
     axis_to_dim = {x: input_shape.dims[x].value for x in self.axis}
     for x in axis_to_dim:
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index f89a615bee5..79ecc3c3fe1 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -66,6 +66,15 @@ class BatchNormalizationTest(keras_parameterized.TestCase):
         kwargs={'scale': False, 'center': False},
         input_shape=(3, 3))
 
+    testing_utils.layer_test(
+        keras.layers.BatchNormalization,
+        kwargs={
+            'gamma_initializer': 'ones',
+            'beta_initializer': 'ones',
+            'moving_mean_initializer': 'zeros',
+            'moving_variance_initializer': 'ones'
+        },
+        input_shape=(3, 2, 4, 2))
 
   @combinations.generate(combinations.combine(mode=['graph', 'eager']))
   def test_batchnorm_weights(self):
@@ -319,7 +328,7 @@ class BatchNormalizationV2Test(keras_parameterized.TestCase):
     norm = normalization_v2.BatchNormalization(fused=True)
     self.assertEqual(norm.fused, True)
     inp = keras.layers.Input(shape=(4, 4))
-    with self.assertRaisesRegex(ValueError, '4D input tensors'):
+    with self.assertRaisesRegex(ValueError, '4D or 5D input tensors'):
       norm(inp)
 
   def test_updates_in_wrap_function(self):
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 1742a919216..0421829bff3 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -43,14 +43,18 @@ class BatchNormalizationTest(test.TestCase):
     return math_ops.cast(y, x.dtype)
 
   def _inference_ref(self, x, scale, offset, mean, var, epsilon, data_format):
-    if data_format not in ['NHWC', 'NCHW']:
-      raise ValueError('data_format must be NCHW or NHWC, '
-                       'got %s.' % data_format)
+    if data_format not in ['NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
+      raise ValueError('data_format must be NCHW or NHWC for 4D tensors or '
+                       'NCDHW or NDHWC for 5D tensors, got %s.' % data_format)
     if data_format == 'NCHW':
       x = array_ops.transpose(x, [0, 2, 3, 1])
+    elif data_format == 'NCDHW':
+      x = array_ops.transpose(x, [0, 2, 3, 4, 1])
     y = self._batch_norm(x, mean, var, offset, scale, epsilon)
     if data_format == 'NCHW':
       y = array_ops.transpose(y, [0, 3, 1, 2])
+    elif data_format == 'NCDHW':
+      y = array_ops.transpose(y, [0, 4, 1, 2, 3])
     return self.evaluate(y)
 
   def _test_inference(self,
@@ -102,17 +106,24 @@ class BatchNormalizationTest(test.TestCase):
 
   def _training_ref(self, x, scale, offset, old_mean, old_var,
                     exponential_avg_factor, epsilon, data_format):
-    if data_format not in ['NHWC', 'NCHW']:
-      raise ValueError('data_format must be NCHW or NHWC, '
-                       'got %s.' % data_format)
+    if data_format not in ['NHWC', 'NCHW', 'NDHWC', 'NCDHW']:
+      raise ValueError('data_format must be NCHW or NHWC for 4D tensors or '
+                       'NCDHW or NDHWC for 5D tensors, got %s.'
% data_format) + use_4d_tensor = (x.shape.ndims == 4) if data_format == 'NCHW': x = array_ops.transpose(x, [0, 2, 3, 1]) + elif data_format == 'NCDHW': + x = array_ops.transpose(x, [0, 2, 3, 4, 1]) + + mean_axis = [0, 1, 2] if use_4d_tensor else [0, 1, 2, 3] batch_mean, batch_var = nn_impl.moments( - math_ops.cast(x, scale.dtype), [0, 1, 2], keep_dims=False) + math_ops.cast(x, scale.dtype), mean_axis, keep_dims=False) y = self._batch_norm(x, batch_mean, batch_var, offset, scale, epsilon) if data_format == 'NCHW': y = array_ops.transpose(y, [0, 3, 1, 2]) + elif data_format == 'NCDHW': + y = array_ops.transpose(y, [0, 4, 1, 2, 3]) # This is for Bessel's correction. tf.nn.moments uses n, instead of n-1, as # the denominator in the formula to calculate variance, while @@ -377,14 +388,18 @@ class BatchNormalizationTest(test.TestCase): def _runtests(self, x_shape, is_training, gradient_test=False, cpu_only=False): + if len(x_shape) == 4: + data_format_list = ['NHWC', 'NCHW'] + else: + data_format_list = ['NCDHW', 'NDHWC'] use_gpu_vals = [False] if test.is_gpu_available(cuda_only=True) and not cpu_only: use_gpu_vals += [True] factors = [1.0, 0.6] for dtype in [np.float16, np.float32]: for use_gpu in use_gpu_vals: - for data_format in ['NHWC', 'NCHW']: - if data_format == 'NHWC': + for data_format in data_format_list: + if data_format == 'NHWC' or data_format == 'NDHWC': scale_shape = x_shape[-1:] else: scale_shape = x_shape[1:2] @@ -444,6 +459,10 @@ class BatchNormalizationTest(test.TestCase): # GPU kernel doesn't properly handle case where non-channel dimensions are 1 self._runtests(x_shape, False, cpu_only=True) + def testInferenceShape7(self): + x_shape = [1, 2, 6, 1, 3] + self._runtests(x_shape, False) + def testTrainingShape1(self): x_shape = [1, 1, 6, 1] self._runtests(x_shape, True) @@ -465,11 +484,16 @@ class BatchNormalizationTest(test.TestCase): x_shape = [0, 131, 127, 6] self._runtests(x_shape, True) + @test_util.run_deprecated_v1 def testTrainingShape6(self): x_shape = [1, 1, 1, 1] # GPU kernel doesn't properly handle case where non-channel dimensions are 1 self._runtests(x_shape, True, cpu_only=True) + def testTrainingShape7(self): + x_shape = [1, 2, 6, 1, 3] + self._runtests(x_shape, True) + @test_util.run_deprecated_v1 def testBatchNormGradInferenceShape1(self): x_shape = [1, 1, 6, 1] @@ -503,6 +527,11 @@ class BatchNormalizationTest(test.TestCase): self._runtests(x_shape, is_training=False, gradient_test=True, cpu_only=True) + @test_util.run_deprecated_v1 + def testBatchNormGradInferenceShape7(self): + x_shape = [1, 2, 6, 1, 3] + self._runtests(x_shape, is_training=False, gradient_test=True) + @test_util.run_deprecated_v1 def testBatchNormGradTrainingShape1(self): x_shape = [1, 1, 6, 1] @@ -535,42 +564,54 @@ class BatchNormalizationTest(test.TestCase): # GPU kernel doesn't properly handle case where non-channel dimensions are 1 self._runtests(x_shape, is_training=True, gradient_test=True, cpu_only=True) + @test_util.run_deprecated_v1 + def testBatchNormGradTrainingShape7(self): + x_shape = [1, 2, 6, 1, 3] + self._runtests(x_shape, is_training=True, gradient_test=True) + def _testBatchNormGradGrad(self, config): shape = config['shape'] err_tolerance = config['err_tolerance'] dtype = config['dtype'] + rank = len(shape) + if rank == 4: + data_format_nhwc, features_nhwc = 'NHWC', shape[3] + data_format_nchw, features_nchw = 'NCHW', shape[1] + else: + data_format_nhwc, features_nhwc = 'NDHWC', shape[4] + data_format_nchw, features_nchw = 'NCDHW', shape[1] for is_training in [True, 
False]: if test.is_gpu_available(cuda_only=True): self._test_grad_grad( shape, - dtype, [shape[3]], + dtype, [features_nhwc], np.float32, use_gpu=True, - data_format='NHWC', + data_format=data_format_nhwc, is_training=is_training, err_tolerance=err_tolerance) self._test_grad_grad( shape, - dtype, [shape[1]], + dtype, [features_nchw], np.float32, use_gpu=True, - data_format='NCHW', + data_format=data_format_nchw, is_training=is_training, err_tolerance=err_tolerance) self._test_grad_grad( shape, - dtype, [shape[3]], + dtype, [features_nhwc], np.float32, use_gpu=False, - data_format='NHWC', + data_format=data_format_nhwc, is_training=is_training, err_tolerance=err_tolerance) self._test_grad_grad( shape, - dtype, [shape[1]], + dtype, [features_nchw], np.float32, use_gpu=False, - data_format='NCHW', + data_format=data_format_nchw, is_training=is_training, err_tolerance=err_tolerance) @@ -610,6 +651,24 @@ class BatchNormalizationTest(test.TestCase): } self._testBatchNormGradGrad(config) + @test_util.run_deprecated_v1 + def testBatchNormGradGradConfig5(self): + config = { + 'shape': [2, 3, 2, 2, 2], + 'err_tolerance': 2e-3, + 'dtype': np.float32, + } + self._testBatchNormGradGrad(config) + + @test_util.run_deprecated_v1 + def testBatchNormGradGradConfig6(self): + config = { + 'shape': [2, 3, 2, 2, 2], + 'err_tolerance': 3e-3, + 'dtype': np.float16, + } + self._testBatchNormGradGrad(config) + if __name__ == '__main__': test.main() diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py index 58dd1852cc5..a02e31f80a5 100644 --- a/tensorflow/python/ops/nn_grad.py +++ b/tensorflow/python/ops/nn_grad.py @@ -897,6 +897,11 @@ def _BaseFusedBatchNormGrad(op, version, *grad): if data_format == b"NCHW": x = array_ops.transpose(x, [0, 2, 3, 1]) grad_y = array_ops.transpose(grad_y, [0, 2, 3, 1]) + elif data_format == b"NCDHW": + x = array_ops.transpose(x, [0, 2, 3, 4, 1]) + grad_y = array_ops.transpose(grad_y, [0, 2, 3, 4, 1]) + target_data_format = ("NHWC" if data_format in (b"NCHW", + b"NHWC") else "NDHWC") args = { "y_backprop": grad_y, "x": x, @@ -904,7 +909,7 @@ def _BaseFusedBatchNormGrad(op, version, *grad): "reserve_space_1": pop_mean, "reserve_space_2": pop_var, "epsilon": epsilon, - "data_format": "NHWC", + "data_format": target_data_format, "is_training": is_training } if version == 2: @@ -912,6 +917,8 @@ def _BaseFusedBatchNormGrad(op, version, *grad): dx, dscale, doffset, _, _ = grad_fun(**args) if data_format == b"NCHW": dx = array_ops.transpose(dx, [0, 3, 1, 2]) + elif data_format == b"NCDHW": + dx = array_ops.transpose(dx, [0, 4, 1, 2, 3]) return dx, dscale, doffset, None, None @@ -941,8 +948,8 @@ def _BatchNormGrad(grad_y, """Returns the gradients for the 3 inputs of BatchNorm. Args: - grad_y: A `Tensor` of 4 dimensions for gradient for y. - x: A `Tensor` of 4 dimensions for x. + grad_y: A `Tensor` of 4 or 5 dimensions for gradient for y. + x: A `Tensor` of 4 or 5 dimensions for x. scale: A `Tensor` of 1 dimension for scaling. pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when is_training=False. 
@@ -968,11 +975,19 @@ def _BatchNormGrad(grad_y,
     if data_format == b"NHWC":
       keepdims = False
       reduce_axis = [0, 1, 2]
-    else:
+    elif data_format == b"NDHWC":
+      keepdims = False
+      reduce_axis = [0, 1, 2, 3]
+    elif data_format == b"NCHW":
       keepdims = True
       reduce_axis = [0, 2, 3]
       shape = [1, array_ops.size(scale), 1, 1]
       scale = array_ops.reshape(scale, shape)
+    else:
+      keepdims = True
+      reduce_axis = [0, 2, 3, 4]
+      shape = [1, array_ops.size(scale), 1, 1, 1]
+      scale = array_ops.reshape(scale, shape)
     mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
     mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
     var_x = math_ops.reduce_mean(
@@ -987,19 +1002,27 @@ def _BatchNormGrad(grad_y,
         grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
     grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
         grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
-    if data_format == b"NCHW":
+    if data_format == b"NCHW" or data_format == b"NCDHW":
       grad_scale = array_ops.squeeze(grad_scale)
     grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
     return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
   else:
     if data_format == b"NHWC":
       reduce_axis = [0, 1, 2]
-    else:
+    elif data_format == b"NDHWC":
+      reduce_axis = [0, 1, 2, 3]
+    elif data_format == b"NCHW":
       reduce_axis = [0, 2, 3]
       shape = [1, array_ops.size(pop_mean), 1, 1]
       pop_mean = array_ops.reshape(pop_mean, shape)
       pop_var = array_ops.reshape(pop_var, shape)
       scale = array_ops.reshape(scale, shape)
+    else:
+      reduce_axis = [0, 2, 3, 4]
+      shape = [1, array_ops.size(pop_mean), 1, 1, 1]
+      pop_mean = array_ops.reshape(pop_mean, shape)
+      pop_var = array_ops.reshape(pop_var, shape)
+      scale = array_ops.reshape(scale, shape)
 
     grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
     var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 89174b29336..d22fbf3fa4e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -1585,7 +1585,7 @@ def fused_batch_norm(
       (http://arxiv.org/abs/1502.03167).
 
   Args:
-    x: Input `Tensor` of 4 dimensions.
+    x: Input `Tensor` of 4 or 5 dimensions.
     scale: A `Tensor` of 1 dimension for scaling.
     offset: A `Tensor` of 1 dimension for bias.
     mean: A `Tensor` of 1 dimension for population mean. The shape and meaning
@@ -1611,7 +1611,8 @@ def fused_batch_norm(
       Variance must be a `Tensor` of the same shape as scale containing
       the exponential running variance.
     epsilon: A small float number added to the variance of x.
-    data_format: The data format for x. Either "NHWC" (default) or "NCHW".
+    data_format: The data format for x. Supports "NHWC" (default) or "NCHW" for
+      4D tensors and "NDHWC" or "NCDHW" for 5D tensors.
     is_training: A bool value to specify if the operation is used for training
       or inference.
     name: A name for this operation (optional).
@@ -1622,7 +1623,7 @@ def fused_batch_norm(
       returned.
 
   Returns:
-    y: A 4D Tensor for the normalized, scaled, offsetted x.
+    y: A 4D or 5D Tensor for the normalized, scaled, offsetted x.
     running_mean: A 1D Tensor for the exponential running mean of x.
       The output value is (1 - exponential_avg_factor) * mean +
       exponential_avg_factor * batch_mean), where batch_mean

From 9cd28f8096a27de5886610cfac9bbcd879dffb58 Mon Sep 17 00:00:00 2001
From: Reed Wanderman-Milne
Date: Fri, 2 Oct 2020 12:11:13 -0700
Subject: [PATCH 15/54] Delegate hyperparameter accesses in LossScaleOptimizer.
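[Editor's note previewing the mechanism in the diff below: delegation keys off the inner optimizer's _hyper dict, using __getattribute__/__setattr__ overrides plus an 'lr' alias. A standalone toy of the same idea; the classes and names here are invented, and __getattr__ stands in for the more defensive __getattribute__ used in the real change:]

    class ToyOptimizer:
        def __init__(self):
            self.hyper = {"learning_rate": 1.0, "beta_1": 0.9}  # like _hyper

    class ToyLossScaleWrapper:
        def __init__(self, inner):
            object.__setattr__(self, "inner", inner)  # bypass our __setattr__

        def __getattr__(self, name):
            # Reached only when normal lookup fails, so real attributes win.
            if name in self.inner.hyper:
                return self.inner.hyper[name]
            raise AttributeError(name)

        def __setattr__(self, name, value):
            # Route known hyperparameters to the inner optimizer; keep
            # everything else on the wrapper itself.
            if name in self.inner.hyper:
                self.inner.hyper[name] = value
            else:
                object.__setattr__(self, name, value)

    opt = ToyOptimizer()
    wrapper = ToyLossScaleWrapper(opt)
    wrapper.beta_1 = 0.5                  # lands on the inner optimizer
    assert opt.hyper["beta_1"] == 0.5
    assert wrapper.beta_1 == 0.5          # read back through the wrapper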
This change is described in the RFC:
https://github.com/tensorflow/community/pull/293. If an alternative approach
ends up being used in the RFC, I will revert this change and implement the
other approach.

PiperOrigin-RevId: 335074869
Change-Id: I07aca34b8a475500107944498b4769d57cbd1bac
---
 .../experimental/loss_scale_optimizer.py      | 93 +++++++++++++++----
 .../experimental/loss_scale_optimizer_test.py | 86 ++++++++++-------
 ...n.experimental.-loss-scale-optimizer.pbtxt |  8 --
 ...n.experimental.-loss-scale-optimizer.pbtxt |  8 --
 4 files changed, 128 insertions(+), 67 deletions(-)

diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
index eb31c647ca3..dd7bf6a682d 100644
--- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
+++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py
@@ -229,6 +229,41 @@ class LossScaleOptimizer(_DelegatingTrackableMixin, optimizer_v2.OptimizerV2):
   >>> opt.apply_gradients([(grad, var)])  # Loss scale is updated here
   >>> var.numpy()
   0.25
+
+  Hyperparameters can be accessed and set on the LossScaleOptimizer, which will
+  be delegated to the wrapped optimizer.
+
+  >>> opt = tf.keras.optimizers.Adam(beta_1=0.8, epsilon=1e-5)
+  >>> lso = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt,
+  ...                                                                "dynamic")
+  >>> opt.beta_1
+  0.8
+  >>> lso.beta_1  # Equivalent to `opt.beta_1`
+  0.8
+  >>> lso.beta_1 = 0.7  # Equivalent to `opt.beta_1 = 0.7`
+  >>> opt.beta_1
+  0.7
+  >>> lso.beta_1
+  0.7
+
+  However, accessing or setting non-hyperparameters is not delegated to the
+  wrapped optimizer. In an Adam optimizer, `beta_1` is a hyperparameter but
+  `epsilon` is not, as the Adam optimizer only calls `Optimizer._set_hyper` on
+  `beta_1`.
+
+  >>> opt.epsilon
+  1e-05
+  >>> lso.epsilon
+  Traceback (most recent call last):
+  ...
+  AttributeError: 'LossScaleOptimizer' object has no attribute 'epsilon'
+  >>> lso.epsilon = 1e-4
+  >>> opt.epsilon
+  1e-05
+
+  In the above example, despite epsilon being set on the LossScaleOptimizer,
+  the old epsilon value will still be used when training, as epsilon was not
+  set on the Adam optimizer.
   """

  _HAS_AGGREGATE_GRAD = True
@@ -268,9 +303,6 @@ class LossScaleOptimizer(_DelegatingTrackableMixin, optimizer_v2.OptimizerV2):
         backend.track_variable(weight)
     self._track_trackable(self._loss_scale, 'loss_scale')
 
-    # Needed because the superclass's __getattribute__ checks this.
-    self._hyper = {}
-
     # To support restoring TensorFlow 2.2 checkpoints.
     self._track_trackable(FakeOptimizerForRestoration(self._optimizer),
                           'base_optimizer')
@@ -516,26 +548,47 @@ class LossScaleOptimizer(_DelegatingTrackableMixin, optimizer_v2.OptimizerV2):
   def add_slot(self, var, slot_name, initializer='zeros'):
     return self._optimizer.add_slot(var, slot_name, initializer)
 
-  # For the most part, we only expose methods in the base OptimizerV2, not
-  # individual subclasses like Adam. However, although "learning_rate" and "lr"
-  # properties are not part of the base OptimizerV2 class, they are part of most
-  # subclasses, so we expose them here for convenience.
+  def __getattribute__(self, name):
+    try:
+      return object.__getattribute__(self, name)
+    except AttributeError as e:
+      if name == '_optimizer' or name == '_hyper':
+        # Avoid infinite recursion
+        raise e
 
-  @property
-  def learning_rate(self):
-    return self._optimizer.learning_rate
+      # Delegate hyperparameter accesses to inner optimizer.
+ if name == 'lr': + name = 'learning_rate' + if name in self._optimizer._hyper: + return self._optimizer._get_hyper(name) + raise e - @learning_rate.setter - def learning_rate(self, lr): - self._optimizer.learning_rate = lr + def __dir__(self): + result = set(super(LossScaleOptimizer, self).__dir__()) + if '_optimizer' in result: + result |= self._optimizer._hyper.keys() + if 'learning_rate' in self._optimizer._hyper.keys(): + result.add('lr') + return list(result) - @property - def lr(self): - return self._optimizer.lr - - @lr.setter - def lr(self, lr): - self._optimizer.lr = lr + def __setattr__(self, name, value): + if name == 'lr': + name = 'learning_rate' + # Delegate setting hyperparameter to inner optimizer if the attribute does + # not exist on the LossScaleOptimizer + try: + # We cannot check for the 'iterations' attribute as it cannot be set after + # it is accessed. + if name != 'iterations': + object.__getattribute__(self, name) + has_attribute = True + except AttributeError: + has_attribute = False + if (name != '_optimizer' and name in self._optimizer._hyper + and not has_attribute): + self._optimizer._set_hyper(name, value) + else: + super(LossScaleOptimizer, self).__setattr__(name, value) # We do not override some OptimizerV2 methods. For each, we describe why we do # not delegate them to self._optimizer: diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py index e375fcf557b..fe3a237ef83 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py @@ -382,47 +382,71 @@ class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegex(ValueError, r'loss_scale cannot be None'): loss_scale_optimizer.LossScaleOptimizer(opt, None) - @parameterized.named_parameters(*TESTCASES) - def testGettingAndSettingLearningRate(self, strategy_fn): - with self.test_session(), strategy_fn().scope() as strategy: - var = variables.Variable([5.0]) - opt = adam.Adam(learning_rate=1.0) - loss = lambda: var * 2.0 - run_fn = lambda: opt.minimize(loss, [var]) - run_op = strategy.experimental_run(run_fn) + def testHyperParametersExposed(self): + with self.cached_session(): + opt = adam.Adam(learning_rate=1.0, beta_1=0.5, beta_2=0.9) + lso = loss_scale_optimizer.LossScaleOptimizer(opt, 'dynamic') + # Force hyperparameters to be created + opt.lr # pylint: disable=pointless-statement self.evaluate(variables.global_variables_initializer()) - self._run_if_in_graph_mode(run_op) - lr = self.evaluate(opt.lr) - self.assertEqual(1.0, lr) + self.assertEqual(self.evaluate(lso.beta_1), 0.5) + self.assertIsInstance(lso.beta_1, variables.Variable) + self.assertEqual(self.evaluate(lso.lr), 1.0) + self.assertIs(lso.lr, opt.lr) + self.assertIs(lso.lr, lso.learning_rate) - opt.lr = 2.0 - lr = self.evaluate(opt.lr) - self.assertEqual(2.0, lr) + lso.beta_1 = 0.25 + self.assertEqual(self.evaluate(lso.beta_1), 0.25) + self.assertEqual(self.evaluate(opt.beta_1), 0.25) + self.assertIs(lso.beta_1, opt.beta_1) + opt.beta_1 = 0.75 + self.assertEqual(self.evaluate(lso.beta_1), 0.75) + self.assertEqual(self.evaluate(opt.beta_1), 0.75) + self.assertIs(lso.beta_1, opt.beta_1) + lso.lr = 2.0 + self.assertEqual(self.evaluate(lso.lr), 2.0) + self.assertEqual(self.evaluate(lso.learning_rate), 2.0) + self.assertEqual(self.evaluate(opt.lr), 2.0) + 
self.assertEqual(self.evaluate(opt.learning_rate), 2.0) + self.assertIs(lso.lr, opt.lr) - self.evaluate(opt.lr.assign(3.0)) - lr = self.evaluate(opt.lr) - self.assertEqual(3.0, lr) + # Test setting attribute that is both attribute on LossScaleOptimizer and + # hyperparameter on wrapped optimizer. + class MyOpt(gradient_descent.SGD): + def __init__(self): + super().__init__() + self._set_hyper('loss_scale', 123.) + + opt = MyOpt() + lso = loss_scale_optimizer.LossScaleOptimizer(opt, 'dynamic') with self.assertRaises(AttributeError): - opt.not_an_attr += 3 + lso.loss_scale = loss_scale_module.FixedLossScale(2.) def testArbitraryAttributesNotExposed(self): - opt = adam.Adam(learning_rate=1.0) - # Test that Adam has attributes 'epsilon' and 'beta1' - opt.epsilon # pylint: disable=pointless-statement - opt.beta_1 # pylint: disable=pointless-statement - opt = loss_scale_optimizer.LossScaleOptimizer(opt, loss_scale=10.) - # Test that attributes defined by OptimizerV2 subclasses are not exposed in - # LossScaleOptimizer, and that the error message is sensible. + opt = gradient_descent.SGD() + lso = loss_scale_optimizer.LossScaleOptimizer(opt, 'dynamic') + self.assertFalse(opt.nesterov) with self.assertRaisesRegex( AttributeError, - "'LossScaleOptimizer' object has no attribute 'epsilon'"): - opt.epsilon # pylint: disable=pointless-statement - with self.assertRaisesRegex( - AttributeError, - "'LossScaleOptimizer' object has no attribute 'beta_1'"): - opt.beta_1 # pylint: disable=pointless-statement + "'LossScaleOptimizer' object has no attribute 'nesterov'"): + lso.nesterov # pylint: disable=pointless-statement + + lso.nesterov = True + self.assertTrue(lso.nesterov) + self.assertFalse(opt.nesterov) + + def testDir(self): + lso = loss_scale_optimizer.LossScaleOptimizer(gradient_descent.SGD(), + 'dynamic') + dir_result = dir(lso) + self.assertIn('learning_rate', dir_result) # Hyperparameter + self.assertIn('lr', dir_result) # Hyperparameter + self.assertIn('minimize', dir_result) # Attribute + self.assertIn('loss_scale', dir_result) # Attribute + self.assertNotIn('nesterov', dir_result) # Attribute on inner optimizer + self.assertIn('nesterov', dir(lso._optimizer)) def testApplyGradientsGetsUnwrappedTensors(self): # Tests that gradients passed to apply_gradients are not wrapped in a diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt index e3c8c7e8a65..3c016d331de 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt @@ -21,18 +21,10 @@ tf_class { name: "iterations" mtype: "" } - member { - name: "learning_rate" - mtype: "" - } member { name: "loss_scale" mtype: "" } - member { - name: "lr" - mtype: "" - } member { name: "weights" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt index e3c8c7e8a65..3c016d331de 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.mixed_precision.experimental.-loss-scale-optimizer.pbtxt @@ -21,18 +21,10 @@ tf_class { name: 
"iterations" mtype: "" } - member { - name: "learning_rate" - mtype: "" - } member { name: "loss_scale" mtype: "" } - member { - name: "lr" - mtype: "" - } member { name: "weights" mtype: "" From 0332cf8e04b83563030e66aa8e5b656113c4eb6e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 2 Oct 2020 12:32:26 -0700 Subject: [PATCH 16/54] Fix tf.test.assert_equal_graph_def(..., hash_table_shared_name=True) for PY3. The value of str(b"foo") has changed from foo to b"foo". PiperOrigin-RevId: 335079075 Change-Id: I9f03a907c6b949cc6e82669f9c3c33cd6dd655e3 --- tensorflow/python/BUILD | 1 + tensorflow/python/framework/test_util.py | 6 +++-- tensorflow/python/framework/test_util_test.py | 22 +++++++++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 71a41c66246..961c845805b 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -2825,6 +2825,7 @@ tf_py_test( ":framework_combinations", ":framework_for_generated_wrappers", ":framework_test_lib", + ":lookup_ops", ":platform_test", ":random_ops", ":resource_variable_ops", diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py index 04e4d6b73f5..ee078dd0455 100644 --- a/tensorflow/python/framework/test_util.py +++ b/tensorflow/python/framework/test_util.py @@ -290,7 +290,8 @@ def _strip_checkpoint_v2_randomized(graph_def): if attr_tensor_value and len(attr_tensor_value.string_val) == 1: attr_tensor_string_value = attr_tensor_value.string_val[0] if (attr_tensor_string_value and - re.match(_SHARDED_SAVE_OP_PATTERN, str(attr_tensor_string_value))): + re.match(compat.as_bytes(_SHARDED_SAVE_OP_PATTERN), + attr_tensor_string_value)): delete_keys.append(attr_key) for attr_key in delete_keys: del node.attr[attr_key] @@ -303,7 +304,8 @@ def _strip_hash_table_shared_name(graph_def): for node in graph_def.node: delete_keys = [] if node.op == "HashTableV2" and "shared_name" in node.attr: - if re.match(_TABLE_SHARED_NAME_PATTERN, str(node.attr["shared_name"].s)): + if re.match(compat.as_bytes(_TABLE_SHARED_NAME_PATTERN), + node.attr["shared_name"].s): delete_keys.append("shared_name") for attr_key in delete_keys: del node.attr[attr_key] diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py index 00ea1aa32b0..fd1b782d621 100644 --- a/tensorflow/python/framework/test_util_test.py +++ b/tensorflow/python/framework/test_util_test.py @@ -43,6 +43,7 @@ from tensorflow.python.framework import ops from tensorflow.python.framework import test_ops # pylint: disable=unused-import from tensorflow.python.framework import test_util from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import lookup_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import resource_variable_ops @@ -108,6 +109,27 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase): r"^Found unexpected node '{{node seven}}"): test_util.assert_equal_graph_def(def_57, def_empty) + def test_assert_equal_graph_def_hash_table(self): + def get_graph_def(): + with ops.Graph().as_default() as g: + x = constant_op.constant([2, 9], name="x") + keys = constant_op.constant([1, 2], name="keys") + values = constant_op.constant([3, 4], name="values") + default = constant_op.constant(-1, name="default") + table = lookup_ops.StaticHashTable( + lookup_ops.KeyValueTensorInitializer(keys, values), default) + _ = 
table.lookup(x) + return g.as_graph_def() + def_1 = get_graph_def() + def_2 = get_graph_def() + # The unique shared_name of each table makes the graph unequal. + with self.assertRaisesRegex(AssertionError, "hash_table_"): + test_util.assert_equal_graph_def(def_1, def_2, + hash_table_shared_name=False) + # That can be ignored. (NOTE: modifies GraphDefs in-place.) + test_util.assert_equal_graph_def(def_1, def_2, + hash_table_shared_name=True) + def testIsGoogleCudaEnabled(self): # The test doesn't assert anything. It ensures the py wrapper # function is generated correctly. From 87ce933e2fffe7fc533cebfa92881de2eee7b614 Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Fri, 2 Oct 2020 12:32:41 -0700 Subject: [PATCH 17/54] Clarify compression remat's names. Clarify compression remat's names so we don't confuse them with layout assignment copies. PiperOrigin-RevId: 335079133 Change-Id: I6ef1686154fb676b8cdee66659c6ace3cdcb6c1e --- tensorflow/compiler/xla/service/hlo_rematerialization.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index e9d6191f945..790d4bfc2fb 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -1524,9 +1524,11 @@ StatusOr CompressInstruction(MemoryUsageTracker* memory_tracker, HloInstruction* compressed = computation->AddInstruction( HloInstruction::CreateUnary(compact_shape, HloOpcode::kCopy, best)); + compressed->SetAndSanitizeName(best->name() + ".remat_compressed"); HloInstruction* uncompressed = computation->AddInstruction( HloInstruction::CreateUnary(best->shape(), HloOpcode::kCopy, compressed)); + uncompressed->SetAndSanitizeName(best->name() + ".remat_uncompressed"); Item* compressed_item = instruction_list->CreateItem(compressed); compressed_item->placed = true; From 724b82d94c2fc0a5a76575311763e8b34ababc60 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 2 Oct 2020 13:18:27 -0700 Subject: [PATCH 18/54] Update ops-related pbtxt files. 
PiperOrigin-RevId: 335087976
Change-Id: I2e2641f3cfe65ed7065f557176f52ea090a762a2
---
 .../ops_history_v2/SnapshotDatasetV2.pbtxt    | 74 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 14 ++++
 2 files changed, 88 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt
index f0868514182..4356d954c8f 100644
--- a/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history_v2/SnapshotDatasetV2.pbtxt
@@ -58,3 +58,77 @@ op {
     has_minimum: true
   }
 }
+op {
+  name: "SnapshotDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "path"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "reader_func_other_args"
+    type_list_attr: "Treader_func_args"
+  }
+  input_arg {
+    name: "shard_func_other_args"
+    type_list_attr: "Tshard_func_args"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "compression"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "reader_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "writer_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "reader_func"
+    type: "func"
+  }
+  attr {
+    name: "shard_func"
+    type: "func"
+  }
+  attr {
+    name: "Treader_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tshard_func_args"
+    type: "list(type)"
+    has_minimum: true
+  }
+}
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index aa28282fb12..07d29506004 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -44435,6 +44435,20 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "reader_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "writer_prefix"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   attr {
     name: "reader_func"
     type: "func"

From a70cc367bc02f85348896f63065165171907c016 Mon Sep 17 00:00:00 2001
From: Tim Shen
Date: Fri, 2 Oct 2020 13:35:19 -0700
Subject: [PATCH 19/54] [MLIR] Add a switch for enabling layouts in MLIR -> XLA
 HLO

Previously, MLIR could carry an optional attribute "minor_to_major" to
indicate a layout. With layouts, the lack of such an attribute means a
descending layout. However, XLA builders don't put descending layouts by
default. Sometimes the shape inferencer forwards layouts from the input.
Those layouts are not meant to be forwarded.

Add a switch to explicitly assign layouts to all ops.

Also fixed a bug where a literal's layout was not correctly set.
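
For illustration (a hypothetical snippet in the spirit of the
layouts_and_names.mlir test below; not part of the diff itself): with the
new switch enabled, an op can pin its layout through the attribute, and ops
without the attribute get the default descending layout rather than whatever
the shape inferencer forwarded:

  // minor_to_major = [0, 1] requests a column-major layout for this 2-D value.
  %cst = "std.constant"() {value = dense<[[42]]> : tensor<1x1xi32>,
                           minor_to_major = dense<[0, 1]> : tensor<2xindex>}
      : () -> tensor<1x1xi32>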
PiperOrigin-RevId: 335091078 Change-Id: I4649afc78e401806dadc6165ba819d1282a18147 --- .../mlir/xla/hlo_function_importer.cc | 9 +-- .../compiler/mlir/xla/mlir_hlo_to_hlo.cc | 81 +++++++++++-------- .../compiler/mlir/xla/mlir_hlo_to_hlo.h | 20 ++++- .../tests/translate/layouts_and_names.mlir | 4 + .../compiler/mlir/xla/xla_mlir_translate.cc | 5 +- 5 files changed, 76 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index 6005fe6e6dd..a3f68411cc3 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -762,17 +762,10 @@ StatusOr HloFunctionImporter::ImportInstruction( ImportInstructionImpl(instruction, func_builder)); if (op == nullptr) return op; - // Best-effort propagation of the layouts. These layouts serve as performance - // hints to the backend. + // See MlirToHloConversionOptions for more about layouts. // // Minor-to-major is a permutation of [0, rank), presenting tensor dimensions // in physical minor-to-major order. - // - // Note that non-array shapes are not carrying layouts, and users have to - // figure out the proper layouts of them through context. This is one of the - // reasons why the attribute-based solution is temporary. - // - // TODO(timshen): Investigate the necessity of having layouts in MHLO. if (instruction->shape().IsArray() && instruction->shape().layout() != LayoutUtil::MakeDescendingLayout( diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index c1d07702100..0923f247cd2 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -499,12 +499,14 @@ class ConvertToHloModule { // single value. explicit ConvertToHloModule( mlir::ModuleOp module, bool use_tuple_args, bool return_tuple, - tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn) + tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn, + MlirToHloConversionOptions options) : module_(module), module_builder_("main"), use_tuple_args_(use_tuple_args), return_tuple_(return_tuple), - shape_representation_fn_(shape_representation_fn) { + shape_representation_fn_(shape_representation_fn), + options_(options) { if (!shape_representation_fn_) shape_representation_fn_ = tensorflow::IdentityShapeRepresentationFn(); } @@ -585,6 +587,8 @@ class ConvertToHloModule { // Unique suffix to give to the name of the next lowered region. 
size_t region_id_ = 0; + + MlirToHloConversionOptions options_; }; } // namespace @@ -1087,18 +1091,19 @@ LogicalResult ExportXlaOp(FusionOp op, OpLoweringContext ctx) { namespace mlir { namespace { -StatusOr CreateLiteralFromAttr(ElementsAttr attr) { +StatusOr CreateArrayLiteralFromAttr(ElementsAttr attr, + xla::Layout layout) { if (attr.isa()) return tensorflow::errors::Unimplemented( "Opaque elements attr not supported"); xla::Shape shape = xla::TypeToShape(attr.getType()); -#define ELEMENTS_ATTR_TO_LITERAL(xla_type, cpp_type) \ - case xla_type: { \ - xla::Array source_data(shape.dimensions()); \ - source_data.SetValues(attr.getValues()); \ - return xla::LiteralUtil::CreateFromArray(source_data); \ +#define ELEMENTS_ATTR_TO_LITERAL(xla_type, cpp_type) \ + case xla_type: { \ + xla::Array source_data(shape.dimensions()); \ + source_data.SetValues(attr.getValues()); \ + return xla::LiteralUtil::CreateFromArrayWithLayout(source_data, layout); \ } switch (shape.element_type()) { @@ -1128,7 +1133,7 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { } xla::Array source_data(shape.dimensions()); source_data.SetValues(values); - return xla::LiteralUtil::CreateFromArray(source_data); + return xla::LiteralUtil::CreateFromArrayWithLayout(source_data, layout); } case xla::PrimitiveType::BF16: { xla::Array source_data(shape.dimensions()); @@ -1145,7 +1150,7 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { } source_data.SetValues(values_double); return xla::LiteralUtil::ConvertF64ToBF16( - xla::LiteralUtil::CreateFromArray(source_data)); + xla::LiteralUtil::CreateFromArrayWithLayout(source_data, layout)); } default: return tensorflow::errors::Internal(absl::StrCat( @@ -1154,25 +1159,33 @@ StatusOr CreateLiteralFromAttr(ElementsAttr attr) { #undef ELEMENTS_ATTR_TO_LITERAL } +xla::Layout ExtractLayout(mlir::Operation* op, int rank) { + if (auto attr = + op->getAttrOfType("minor_to_major")) { + llvm::SmallVector minor_to_major; + minor_to_major.reserve(attr.size()); + for (const llvm::APInt& i : attr) { + minor_to_major.push_back(i.getZExtValue()); + } + return xla::LayoutUtil::MakeLayout(minor_to_major); + } + return xla::LayoutUtil::MakeDescendingLayout(rank); +} + LogicalResult ConvertToHloModule::Lower( mlir::Operation* inst, bool is_entry_function, llvm::ArrayRef> ret_shardings, xla::XlaBuilder* builder, ConvertToHloModule::ValueLoweringMap* value_lowering, xla::XlaComputation* result) { - // See hlo_function_importer.cc for documentation about layouts in MHLO. - auto propagate_layouts = [](mlir::Operation* inst, xla::XlaOp xla_op) { - auto attr = - inst->getAttrOfType("minor_to_major"); - if (!attr) return; - - auto* v = xla::internal::XlaBuilderFriend::GetInstruction(xla_op) - ->mutable_shape() - ->mutable_layout() - ->mutable_minor_to_major(); - v->Clear(); - for (const llvm::APInt& i : attr) { - *v->Add() = i.getZExtValue(); + // See MlirToHloConversionOptions for more about layouts. 
+ auto propagate_layouts = [this](mlir::Operation* inst, xla::XlaOp xla_op) { + if (options_.propagate_layouts) { + auto* shape = xla::internal::XlaBuilderFriend::GetInstruction(xla_op) + ->mutable_shape(); + if (shape->tuple_shapes().empty()) + *shape->mutable_layout() = + ExtractLayout(inst, shape->dimensions().size()).ToProto(); } }; @@ -1216,12 +1229,14 @@ LogicalResult ConvertToHloModule::Lower( } if (matchPattern(inst, m_Constant(&const_attr))) { - auto literal_or = CreateLiteralFromAttr(const_attr); + xla::Layout layout; + layout = ExtractLayout(inst, const_attr.getType().getRank()); + auto literal_or = CreateArrayLiteralFromAttr(const_attr, layout); if (!literal_or.ok()) return inst->emitError(literal_or.status().ToString()); auto constant = xla::ConstantLiteral(builder, literal_or.ValueOrDie()); value_map[inst->getResult(0)] = constant; - propagate_layouts(inst, constant); + return success(); } @@ -1674,22 +1689,24 @@ LogicalResult AddDynamicParameterBindings(mlir::ModuleOp module, } // namespace Status ConvertRegionToComputation(mlir::Region* region, - xla::XlaComputation* func) { + xla::XlaComputation* func, + MlirToHloConversionOptions options) { mlir::ModuleOp module; - ConvertToHloModule converter(module, true, true, {}); + ConvertToHloModule converter(module, true, true, {}, options); if (failed(converter.LowerRegionAsComputation(region, func))) return tensorflow::errors::Internal( "failed to convert region to computation"); return Status::OK(); } -Status ConvertMlirHloToHlo(mlir::ModuleOp module, xla::HloProto* hlo_proto, - bool use_tuple_args, bool return_tuple, - const tensorflow::XlaHelpers::ShapeRepresentationFn - shape_representation_fn) { +Status ConvertMlirHloToHlo( + mlir::ModuleOp module, xla::HloProto* hlo_proto, bool use_tuple_args, + bool return_tuple, + const tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn, + MlirToHloConversionOptions options) { mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); ConvertToHloModule converter(module, use_tuple_args, return_tuple, - shape_representation_fn); + shape_representation_fn, options); if (failed(converter.Run())) return diag_handler.ConsumeStatus(); auto hlo_module = converter.ConsumeMainProto(); hlo_proto->mutable_hlo_module()->Swap(&hlo_module); diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h index 6f2b5a6db95..4ca3e586128 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.h @@ -25,6 +25,18 @@ limitations under the License. namespace mlir { +struct MlirToHloConversionOptions { + // Best-effort propagation of the layouts. These layouts serve as performance + // hints to the backend. + // + // Note that non-array shapes are not carrying layouts, and users have to + // figure out the proper layouts of them through context. This is one of the + // reasons why the attribute-based solution is temporary. + // + // TODO(timshen): Investigate the necessity of having layouts in MHLO. + bool propagate_layouts = false; +}; + // Converts a MLIR module in HLO dialect into a HloModuleProto. If // use_tuple_args is set, then the entry computations's arguments are converted // to a tuple and passed as a single parameter. @@ -32,15 +44,19 @@ namespace mlir { // are converted to a tuple even when there is only a single return value. // Multiple return values are always converted to a tuple and returned as a // single value. 
+// +// TODO(timshen): move other options into `options`. Status ConvertMlirHloToHlo(mlir::ModuleOp module, ::xla::HloProto* hlo_proto, bool use_tuple_args, bool return_tuple, const tensorflow::XlaHelpers::ShapeRepresentationFn - shape_representation_fn = nullptr); + shape_representation_fn = nullptr, + MlirToHloConversionOptions options = {}); // Converts a region to a computation. It returns a standalone module that // contains the converted region as the entry computation. Status ConvertRegionToComputation(mlir::Region* region, - ::xla::XlaComputation* func); + ::xla::XlaComputation* func, + MlirToHloConversionOptions options = {}); // Creates XlaOp equivalent of a given MLIR operation using the operand info // from `value_lowering` map. diff --git a/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.mlir b/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.mlir index 6a7debc8c6c..2ef0aaf3f50 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/layouts_and_names.mlir @@ -26,5 +26,9 @@ func @main(%arg0: tensor<128x224x224x4xf16>, %arg1: tensor<64x7x7x4xf16>) -> ten rhs_dilations = dense<1> : tensor<2xi64>, window_strides = dense<2> : tensor<2xi64> } : (tensor<128x224x224x4xf16>, tensor<64x7x7x4xf16>)-> tensor<128x64x112x112xf16> loc("root.42") + + // CHECK: s32[1,1]{0,1} constant({ {42} }) + %cst_1 = "std.constant"() {value = dense<[[42]]> : tensor<1x1xi32>, minor_to_major = dense<[0, 1]> : tensor<2xindex>} : () -> tensor<1x1xi32> + return %0 : tensor<128x64x112x112xf16> } diff --git a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc index 55833bf9939..3ee70db1813 100644 --- a/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/xla/xla_mlir_translate.cc @@ -129,8 +129,11 @@ static mlir::LogicalResult MlirHloToHloTextTranslateFunctionImpl( if (!module) return mlir::failure(); HloProto hloProto; + mlir::MlirToHloConversionOptions options; + options.propagate_layouts = with_layouts; Status status = mlir::ConvertMlirHloToHlo( - module, &hloProto, emit_use_tuple_arg, emit_return_tuple); + module, &hloProto, emit_use_tuple_arg, emit_return_tuple, + /*shape_representation_fn=*/nullptr, options); if (!status.ok()) { LOG(ERROR) << "Module conversion failed: " << status; return mlir::failure(); From ad1b06b53a9dd5b049da9202df9bed07c7040fff Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Fri, 2 Oct 2020 13:38:33 -0700 Subject: [PATCH 20/54] Update lstm/gru tests. PiperOrigin-RevId: 335091645 Change-Id: I812f6e629f3c3e850b2a8418a572fd3be1661868 --- tensorflow/python/keras/layers/gru_v2_test.py | 3 ++- tensorflow/python/keras/layers/lstm_v2_test.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/gru_v2_test.py b/tensorflow/python/keras/layers/gru_v2_test.py index 4fd8d23e0fd..db2b0a2e7b9 100644 --- a/tensorflow/python/keras/layers/gru_v2_test.py +++ b/tensorflow/python/keras/layers/gru_v2_test.py @@ -641,8 +641,9 @@ class GRUV2Test(keras_parameterized.TestCase): model.predict(inputs) # TODO (b/169895267): test with xla_gpu is disabled. 
-  @testing_utils.run_v2_only
   def test_deepcopy(self):
+    if not context.executing_eagerly():
+      self.skipTest('v2-only test')
     original_layer = rnn.GRU(5)
     copied_layer = copy.deepcopy(original_layer)
     self.assertEqual(copied_layer.units, 5)
diff --git a/tensorflow/python/keras/layers/lstm_v2_test.py b/tensorflow/python/keras/layers/lstm_v2_test.py
index b4fc865bc9c..c6cb9208357 100644
--- a/tensorflow/python/keras/layers/lstm_v2_test.py
+++ b/tensorflow/python/keras/layers/lstm_v2_test.py
@@ -842,8 +842,9 @@ class LSTMV2Test(keras_parameterized.TestCase):
     model.predict(inputs)

   # TODO (b/169895267): test with xla_gpu is disabled.
-  @testing_utils.run_v2_only
   def test_deepcopy(self):
+    if not context.executing_eagerly():
+      self.skipTest('v2-only test')
     original_layer = rnn.LSTM(5)
     copied_layer = copy.deepcopy(original_layer)
     self.assertEqual(copied_layer.units, 5)

From 621872ca7096604cb3128075f68334ebfc26e8af Mon Sep 17 00:00:00 2001
From: Ken Franko
Date: Fri, 2 Oct 2020 13:58:56 -0700
Subject: [PATCH 21/54] Enable presubmit testing with the MLIR bridge for
 distribute tests that pass.

These tests will run with both the new bridge and the old bridge.

PiperOrigin-RevId: 335095507
Change-Id: I3d86a94f5398793a63d82424a2c28f4c7ad07b35
---
 tensorflow/python/distribute/BUILD | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 004297d50ab..0fc54b0c8d1 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -1338,6 +1338,7 @@ distribute_py_test(
 distribute_py_test(
     name = "strategy_reduce_test",
     srcs = ["strategy_reduce_test.py"],
+    disable_mlir_bridge = False,
     main = "strategy_reduce_test.py",
     tags = [
         "multi_and_single_gpu",
@@ -1768,6 +1769,7 @@ py_library(
 distribute_py_test(
     name = "test_util_test",
     srcs = ["test_util_test.py"],
+    disable_mlir_bridge = False,
     tags = [
         "multi_and_single_gpu",
     ],

From 7ae3aed902725fbcd2258164778c0dd8eb6dd689 Mon Sep 17 00:00:00 2001
From: Andy Ly
Date: Fri, 2 Oct 2020 14:03:03 -0700
Subject: [PATCH 22/54] Internal build changes.

PiperOrigin-RevId: 335096321
Change-Id: If49e4a5d8a0720b1b9c679aa83f1ed3b8dfb0e2e
---
 .../mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp b/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp
index 255ed378e64..5633f639f04 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp
@@ -20,8 +20,8 @@ limitations under the License.
 #include 
 #include 

-#include "third_party/llvm/llvm-project/llvm/include/llvm/ADT/ArrayRef.h"
-#include "third_party/llvm/llvm-project/llvm/include/llvm/Support/raw_ostream.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/raw_ostream.h"
 #include "third_party/llvm/llvm-project/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h"

 #if GOOGLE_CUDA

From 99d1481fc6948b3cd9b20a289bf5523586f0c669 Mon Sep 17 00:00:00 2001
From: Dan Moldovan
Date: Fri, 2 Oct 2020 14:04:41 -0700
Subject: [PATCH 23/54] Add extra checks for shape and dtype for control flow
 conditionals.
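
For example (an illustrative sketch only; assumes code staged by AutoGraph
inside a tf.function), a non-boolean condition used to fail deep inside TF
control flow and is now rejected up front with an actionable message:

  n = tf.constant(3)  # int32, not a boolean
  while n:            # now raises ValueError: condition of while loop expected
    n -= 1            # to be `tf.bool` scalar ... use `tf.cast`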
PiperOrigin-RevId: 335096700
Change-Id: I742518d56648aa4d99bba374c824587f25c2c220
---
 .../autograph/operators/control_flow.py      |  31 ++++-
 .../autograph/operators/control_flow_test.py | 125 ++++++++++++++++++
 tensorflow/python/autograph/utils/testing.py |  27 +++-
 3 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index 9bd139c031f..aaa4808cb0a 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -116,6 +116,33 @@ def _is_none_or_undef(value):
           or isinstance(value, variables.Undefined))

+def _verify_tf_condition(cond, tag):
+  """Ensures that the condition can be used in a TF control flow."""
+  extra_hint = 'to check for None, use `is not None`'
+  cond = ops.convert_to_tensor_v2(cond)
+
+  if cond.dtype != dtypes.bool:
+    raise ValueError(
+        'condition of {} expected to be `tf.bool` scalar, got {}'
+        '; to use as boolean Tensor, use `tf.cast`'
+        '; {}'.format(tag, cond, extra_hint))
+
+  if cond.shape is None or cond.shape.ndims is None:
+    # TODO(mdan): Consider an explicit size check, if not too slow.
+    cond = array_ops.reshape(cond, ())
+
+  elif cond.shape.ndims > 0:
+    known_dims = [d for d in cond.shape.as_list() if d is not None]
+    if np.prod(known_dims) > 1:
+      raise ValueError(
+          'condition of {} expected to be `tf.bool` scalar, got {}'
+          '; {}'.format(tag, cond, extra_hint))
+    else:
+      cond = array_ops.reshape(cond, ())
+
+  return cond
+
+
 def _verify_loop_init_vars(init_vars, symbol_names, first_iter_vars=None):
   """Ensures that all values in the state are valid to use in a TF loop.

@@ -1038,7 +1065,7 @@ def _tf_while_stmt(test, body, get_state, set_state, symbol_names, opts):
       loop_vars = loop_vars[1:]
       set_state(loop_vars)

-      return test()
+      return _verify_tf_condition(test(), 'while loop')

   def aug_body(*loop_vars):
     if require_one_iteration:
@@ -1141,6 +1168,8 @@ def if_stmt(cond, body, orelse, get_state, set_state, symbol_names, nouts):
 def _tf_if_stmt(
     cond, body, orelse, get_state, set_state, symbol_names, nouts):
   """Overload of if_stmt that stages a TF cond."""
+  cond = _verify_tf_condition(cond, 'if statement')
+
   if not nouts:
     prev_get_state, prev_set_state = get_state, set_state
     # Control flow V1 wants at least one output.
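
A minimal sketch of how the new helper treats accepted and rejected
conditions (assumes eager TF 2.x with `tf` = `tensorflow`; not part of the
diff):

  _verify_tf_condition(tf.constant(True), 'if statement')        # scalar: ok
  _verify_tf_condition(tf.constant([[True]]), 'if statement')    # reshaped to ()
  _verify_tf_condition(tf.constant(1), 'while loop')             # ValueError: not tf.bool
  _verify_tf_condition(tf.constant([True, False]), 'while loop') # ValueError: not scalar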
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py index 32b36a29797..8d3c63b5a89 100644 --- a/tensorflow/python/autograph/operators/control_flow_test.py +++ b/tensorflow/python/autograph/operators/control_flow_test.py @@ -35,6 +35,7 @@ from tensorflow.python.autograph.utils import testing from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -46,6 +47,20 @@ from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test +def _unranked_item(value): + rand_rank = random_ops.random_uniform( + shape=(), minval=3, maxval=4, dtype=dtypes.int32) + rand_shape = array_ops.ones([rand_rank], dtype=dtypes.int32) + return array_ops.fill(rand_shape, value) + + +def _partial_shaped_bools(): + rand_vect = math_ops.range( + random_ops.random_uniform( + shape=(), minval=2, maxval=3, dtype=dtypes.int32)) + return array_ops.expand_dims_v2(rand_vect, 0) < 0 + + class ForLoopTest(testing.AutoGraphTestCase): def test_tensor(self): @@ -871,6 +886,60 @@ class WhileLoopTest(testing.AutoGraphTestCase): with self.assertRaisesRegex(ValueError, r"'s'.* shape \(1,\) after"): self._basic_loop(0, lambda i, s: np.array([1], dtype=np.int32)) + def _fixed_while_loop(self, cond_fn): + def test_(): + return cond_fn(s) + + def body(): + nonlocal s + s += 1 + + def set_state(loop_vars): + nonlocal s + s, = loop_vars + + s = constant_op.constant(0) + control_flow.while_stmt( + test=test_, + body=body, + get_state=lambda: (s,), + set_state=set_state, + symbol_names=('s',), + opts={}) + return s + + def _assertFixedLoopResult(self, cond, expected): + def test_fn(): + return self._fixed_while_loop(cond) + self.assertEqual(test_fn(), expected) + + def test_tensor_legal_cond_scalar(self): + self._assertFixedLoopResult(lambda s: constant_op.constant(False), 0) + self._assertFixedLoopResult(lambda s: s < 2, 2) + + def test_tensor_legal_cond_single_element_nd(self): + self._assertFixedLoopResult(lambda s: constant_op.constant([[False]]), 0) + self._assertFixedLoopResult(lambda s: _unranked_item(False), 0) + + def _assertCondCheckFails(self, cond): + with self.assertRaisesRegex( + ValueError, 'condition of while loop expected to be `tf.bool`'): + self._fixed_while_loop(cond) + + def test_tensor_illegal_cond_not_bool(self): + self._assertCondCheckFails(lambda s: constant_op.constant(1)) + self._assertCondCheckFails(lambda s: s) + + def test_tensor_illegal_cond_not_single_element(self): + self._assertCondCheckFails(lambda s: constant_op.constant([1, 2, 3])) + self._assertCondCheckFails(lambda s: constant_op.constant([True, False])) + + def test_tensor_illegal_cond_not_single_element_dynamic_shape(self): + self._fixed_while_loop(lambda s: _partial_shaped_bools()) + # TODO(mdan): This error is quite bad. Measure the cost of an assertion. 
+ self.assertRaisesRuntime( + errors_impl.InvalidArgumentError, 'requested shape has 1') + class IfStmtTest(testing.AutoGraphTestCase): @@ -1065,6 +1134,62 @@ class IfStmtTest(testing.AutoGraphTestCase): TypeError, "'x' has dtype int32.*but.*float32"): self._basic_cond(lambda: 1, lambda: 1.0) + def _fixed_cond(self, cond_val): + def body(): + nonlocal x + x = 1 + + def orelse(): + nonlocal x + x = -1 + + def set_state(cond_vars): + nonlocal x + x, = cond_vars + + x = 0 + control_flow.if_stmt( + cond=cond_val, + body=body, + orelse=orelse, + get_state=lambda: (x,), + set_state=set_state, + symbol_names=('x',), + nouts=1) + return x + + def _assertFixedCondResult(self, cond, expected): + def test_fn(): + return self._fixed_cond(cond) + self.assertEqual(test_fn(), expected) + + def test_tensor_legal_cond_scalar(self): + self._assertFixedCondResult(constant_op.constant(True), 1) + self._assertFixedCondResult(constant_op.constant(False), -1) + + def test_tensor_legal_cond_single_element_nd(self): + self._assertFixedCondResult(constant_op.constant([[True]]), 1) + self._assertFixedCondResult(constant_op.constant([[False]]), -1) + self._assertFixedCondResult(_unranked_item(True), 1) + self._assertFixedCondResult(_unranked_item(False), -1) + + def _assertCondCheckFails(self, cond): + with self.assertRaisesRegex( + ValueError, 'condition of if statement expected to be `tf.bool`'): + self._fixed_cond(cond) + + def test_tensor_illegal_cond_not_bool(self): + self._assertCondCheckFails(constant_op.constant(1)) + + def test_tensor_illegal_cond_not_single_element(self): + self._assertCondCheckFails(constant_op.constant([1, 2, 3])) + self._assertCondCheckFails(constant_op.constant([True, False])) + + def test_tensor_illegal_cond_not_single_element_dynamic_shape(self): + self._fixed_cond(_partial_shaped_bools()) + # TODO(mdan): This error is quite bad. Measure the cost of an assertion. + self.assertRaisesRuntime( + errors_impl.InvalidArgumentError, 'requested shape has 1') if __name__ == '__main__': test.main() diff --git a/tensorflow/python/autograph/utils/testing.py b/tensorflow/python/autograph/utils/testing.py index bec6966e7cb..5b74496fb74 100644 --- a/tensorflow/python/autograph/utils/testing.py +++ b/tensorflow/python/autograph/utils/testing.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import re +import sys import types import unittest @@ -81,18 +82,29 @@ class AutoGraphTestCase(test.TestCase): @def_function.function(autograph=False) # Testing autograph itself. def fn_wrapper(): self.assertions = [] + self.raises_cm = None self.graph_assertions = [] self.trace_log = [] fn() targets = [args for _, args in self.assertions] return targets - tensors = fn_wrapper() + try: + tensors = fn_wrapper() - for assertion in self.graph_assertions: - assertion(fn_wrapper.get_concrete_function().graph) + for assertion in self.graph_assertions: + assertion(fn_wrapper.get_concrete_function().graph) + + actuals = self.evaluate(tensors) + + except: # pylint:disable=bare-except + if self.raises_cm is not None: + # Note: Yes, the Raises and function contexts cross. 
+        self.raises_cm.__exit__(*sys.exc_info())
+        return
+      else:
+        raise

-    actuals = self.evaluate(tensors)
     for (assertion, _), values in zip(self.assertions, actuals):
       assertion(*values)

@@ -109,6 +121,7 @@ class AutoGraphTestCase(test.TestCase):
     super().setUp()
     self.variables = {}
     self.trace_log = []
+    self.raises_cm = None
     op_callbacks.add_op_callback(self._op_callback)

   def tearDown(self):
@@ -145,3 +158,9 @@ class AutoGraphTestCase(test.TestCase):

   def assertDictEqual(self, *args):
     self.assertions.append((super().assertDictEqual, list(args)))
+
+  def assertRaisesRuntime(self, *args):
+    if self.raises_cm is not None:
+      raise ValueError('cannot use more than one assertRaisesRuntime in a test')
+    self.raises_cm = self.assertRaisesRegex(*args)
+    self.raises_cm.__enter__()

From 17111c114a0a792f202e72f30834525352ce9cfc Mon Sep 17 00:00:00 2001
From: Xiao Yu
Date: Fri, 2 Oct 2020 14:19:51 -0700
Subject: [PATCH 24/54] Disable a failing test.

PiperOrigin-RevId: 335099725
Change-Id: I08a069d82c4bb6a32c98ba3055b5e907f82e03aa
---
 tensorflow/python/kernel_tests/atrous_convolution_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 2fb8a37e2b9..6297635f91b 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -265,6 +265,7 @@ class AtrousConvolutionTest(test.TestCase):
         self.assertLess(err, err_tolerance)

   @test_util.run_v1_only("b/120545219")
+  @test_util.disable_xla("b/169963909")
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:

From 0995c980192e61ee90c1baded100b56a1fc26f8b Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Fri, 2 Oct 2020 14:58:06 -0700
Subject: [PATCH 25/54] [NFC] Eliminate references to HLO instr in the
 Convolution/Cholesky Thunk constructor.

- Move GpuConvConfig construction to the thunk emitter instead of the
  ConvolutionThunk constructor.
- Use the passed in `type` in the CholeskyThunk constructor.
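
A rough sketch of the new call shape (abridged from the thunk_emitter.cc hunk
below; names as in that hunk, error handling elided):

  TF_ASSIGN_OR_RETURN(GpuConvConfig config, GetGpuConvConfig(custom_call));
  AddThunkToThunkSequence(absl::make_unique<ConvolutionThunk>(
      context_->GetThunkInfo(custom_call), std::move(config),
      std::move(operand_slices), conv_result_slice, scratch_slice,
      tuple_result_slice));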
PiperOrigin-RevId: 335107107 Change-Id: Ice3832009da338e15bf56bfff5420f41dd2f5f5b --- tensorflow/compiler/xla/service/gpu/BUILD | 1 + tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc | 5 +---- tensorflow/compiler/xla/service/gpu/convolution_thunk.cc | 7 +++---- tensorflow/compiler/xla/service/gpu/convolution_thunk.h | 2 +- tensorflow/compiler/xla/service/gpu/thunk_emitter.cc | 9 +++++++-- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 0cdfad25a44..913247a4299 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -217,6 +217,7 @@ cc_library( ":backend_configs_cc", ":buffer_allocations", ":gpu_constants", + ":gpu_conv_runner", ":gpu_executable", ":ir_emission_utils", ":nccl_all_reduce_thunk", diff --git a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc index c34c299fea8..4ac5784e51a 100644 --- a/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/cholesky_thunk.cc @@ -45,10 +45,7 @@ CholeskyThunk::CholeskyThunk(ThunkInfo thunk_info, info_buffer_(info_buffer), type_(type), batch_size_(batch_size), - a_batch_stride_( - n * n * - ShapeUtil::ByteSizeOfPrimitiveType( - thunk_info.hlo_instruction->operand(0)->shape().element_type())), + a_batch_stride_(n * n * ShapeUtil::ByteSizeOfPrimitiveType(type)), n_(n) {} Status CholeskyThunk::ExecuteOnStream(const ExecuteParams& params) { diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index cf193b96676..efa3a5802d6 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -31,7 +31,8 @@ namespace xla { namespace gpu { ConvolutionThunk::ConvolutionThunk( - ThunkInfo thunk_info, std::vector operand_slices, + ThunkInfo thunk_info, GpuConvConfig&& config, + std::vector operand_slices, BufferAllocation::Slice result_slice, BufferAllocation::Slice scratch_slice, BufferAllocation::Slice tuple_result_slice) : Thunk(Kind::kConvolution, thunk_info), @@ -39,9 +40,7 @@ ConvolutionThunk::ConvolutionThunk( result_buffer_(result_slice), scratch_buffer_(scratch_slice), tuple_result_buffer_(tuple_result_slice), - config_(GetGpuConvConfig( - Cast(thunk_info.hlo_instruction)) - .ValueOrDie()) {} + config_(std::move(config)) {} Status ConvolutionThunk::ExecuteOnStream(const ExecuteParams& params) { const auto& buffer_allocations = *params.buffer_allocations; diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h index bc4923e38c0..ffefe58e229 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.h @@ -43,7 +43,7 @@ class ConvolutionThunk : public Thunk { // write a tuple (result, scratch_memory) into `tuple_result_buffer`. // // operand_slices should be in the same order as cudnn_call->operands(). 
- ConvolutionThunk(ThunkInfo thunk_info, + ConvolutionThunk(ThunkInfo thunk_info, GpuConvConfig&& config, std::vector operand_slices, BufferAllocation::Slice result_slice, BufferAllocation::Slice scratch_slice, diff --git a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc index 690d0c9de56..e5798677071 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/thunk_emitter.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/cudnn_batchnorm_thunk.h" #include "tensorflow/compiler/xla/service/gpu/fft_thunk.h" #include "tensorflow/compiler/xla/service/gpu/gemm_thunk.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_conv_runner.h" #include "tensorflow/compiler/xla/service/gpu/infeed_thunk.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/gpu/outfeed_thunk.h" @@ -238,9 +239,13 @@ Status ThunkEmitter::HandleCustomCall(HloInstruction* custom_call) { auto conv_result_slice = GetAllocationSlice(*custom_call, {0}); auto scratch_slice = GetAllocationSlice(*custom_call, {1}); + TF_ASSIGN_OR_RETURN( + GpuConvConfig config, + GetGpuConvConfig(Cast(custom_call))); AddThunkToThunkSequence(absl::make_unique( - context_->GetThunkInfo(custom_call), std::move(operand_slices), - conv_result_slice, scratch_slice, tuple_result_slice)); + context_->GetThunkInfo(custom_call), std::move(config), + std::move(operand_slices), conv_result_slice, scratch_slice, + tuple_result_slice)); return Status::OK(); } From 050bc18321c0a3584492bbd3df475952ef5b0a1d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 2 Oct 2020 14:58:37 -0700 Subject: [PATCH 26/54] [XLA] Switch implementation of erf to use the same rational polynomial approximation as Eigen. PiperOrigin-RevId: 335107203 Change-Id: Ieccd50764f9287fcc74d061b545530cde53b1fe2 --- tensorflow/compiler/xla/client/lib/math.cc | 30 +++++----------------- 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 76cc6f0159b..410c86732d6 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -203,7 +203,7 @@ static XlaOp ErfcImpl32(XlaOp x) { // Precondition: abs(x) <= 1. Otherwise, use ErfcImpl. // // This follows Cephes's f32 implementation of erf. -static XlaOp ErfImpl32Cephes(XlaOp x) { +static XlaOp ErfImpl32(XlaOp x) { // Coefficients for by erf(f32), from Cephes. // // erf(x) = x P(x^2), 0 < x < 1 @@ -291,31 +291,11 @@ XlaOp Erfc(XlaOp x) { // (not surprising!), so upcast to f32 in this case. return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl32(x), - ScalarLike(x, 1) - ErfImpl32Cephes(x)); + ScalarLike(x, 1) - ErfImpl32(x)); }); }); } -// Compute a polynomial approximation of the error function. -// This is the same approximation used by Eigen. 
-static XlaOp ErfImpl32(XlaOp x) {
-  static const std::array kAlpha{
-      -2.72614225801306e-10f, 2.77068142495902e-08f, -2.10102402082508e-06f,
-      -5.69250639462346e-05f, -7.34990630326855e-04f, -2.95459980854025e-03f,
-      -1.60960333262415e-02f,
-  };
-
-  static const std::array kBeta{
-      -1.45660718464996e-05f, -2.13374055278905e-04f, -1.68282697438203e-03f,
-      -7.37332916720468e-03f, -1.42647390514189e-02f,
-  };
-
-  x = Clamp(ScalarLike(x, -4.f), x, ScalarLike(x, 4.f));
-  auto x2 = x * x;
-  return x * EvaluatePolynomial(x2, kAlpha) /
-         EvaluatePolynomial(x2, kBeta);
-}
-
 XlaOp Erf(XlaOp x) {
   auto& b = *x.builder();
   return b.ReportErrorOrReturn([&]() -> StatusOr {
@@ -330,8 +310,10 @@ XlaOp Erf(XlaOp x) {
     }
     // Erf(c)Impl don't have enough precision when run with bf16 intermediates
     // (not surprising!), so upcast to f32 in this case.
-    return DoWithUpcastToF32(x, {BF16, F16},
-                             [](XlaOp x) { return ErfImpl32(x); });
+    return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) {
+      return Select(Lt(Abs(x), ScalarLike(x, 1)), ErfImpl32(x),
+                    ScalarLike(x, 1) - ErfcImpl32(x));
+    });
   });
 }

From b7ae0d348aaaf43a9f4ab6415fc6c51003b410a7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 2 Oct 2020 14:58:40 -0700
Subject: [PATCH 27/54] Revise the error message raised in _assert_built_as_v1
 in base_layer_v1.py to remind users to check whether their children have
 called super().__init__(..)

PiperOrigin-RevId: 335107218
Change-Id: I1b992357398c82231987345f056f91215908d71d
---
 tensorflow/python/keras/engine/base_layer_v1.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/keras/engine/base_layer_v1.py b/tensorflow/python/keras/engine/base_layer_v1.py
index dc34dd524a3..fd9db0e4346 100644
--- a/tensorflow/python/keras/engine/base_layer_v1.py
+++ b/tensorflow/python/keras/engine/base_layer_v1.py
@@ -835,8 +835,9 @@ class Layer(base_layer.Layer):
   def _assert_built_as_v1(self):
     if not hasattr(self, '_originally_built_as_v1'):
       raise ValueError(
-          'Your Layer or Model is in an invalid state. This can happen if you '
-          'are interleaving estimator/non-estimator models or '
+          'Your Layer or Model is in an invalid state. '
+          'This can happen for the following cases:\n '
+          '1. You might be interleaving estimator/non-estimator models or '
          'interleaving models/layers made in tf.compat.v1.Graph.as_default() '
          'with models/layers created outside of it. '
          'Converting a model to an estimator (via model_to_estimator) '
@@ -844,7 +845,11 @@ class Layer(base_layer.Layer):
          'if they were not the model converted to an estimator). '
          'Similarly, making a layer or a model inside a '
          'a tf.compat.v1.Graph invalidates all layers/models you previously '
-          'made outside of the graph.')
+          'made outside of the graph.\n'
+          '2. You might be using a custom keras layer implementation with '
+          ' custom __init__ which didn\'t call super().__init__. '
+          ' Please check the implementation of %s and its bases.'
% + (type(self),)) @property def dtype(self): From 70302db2045a8a96af10db481a4f2f062ce05f82 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 2 Oct 2020 15:04:25 -0700 Subject: [PATCH 28/54] Remove direct includes of gmock.h in profiler PiperOrigin-RevId: 335108352 Change-Id: I0886cdab14f3b2382f31e3073ce2219e28a8223f --- tensorflow/core/profiler/internal/BUILD | 1 - tensorflow/core/profiler/internal/cpu/BUILD | 1 - tensorflow/core/profiler/internal/cpu/host_tracer_test.cc | 1 - tensorflow/core/profiler/internal/traceme_recorder_test.cc | 1 - 4 files changed, 4 deletions(-) diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD index bbd73edffe6..52493bc0fc7 100644 --- a/tensorflow/core/profiler/internal/BUILD +++ b/tensorflow/core/profiler/internal/BUILD @@ -406,7 +406,6 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "@com_google_absl//absl/strings", - "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD index 6fb518d413e..dcccf8a8056 100644 --- a/tensorflow/core/profiler/internal/cpu/BUILD +++ b/tensorflow/core/profiler/internal/cpu/BUILD @@ -62,7 +62,6 @@ tf_cc_test( "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", - "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc index 0e4c3dd7a9b..86c06246031 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include -#include #include "absl/strings/string_view.h" #include "absl/types/optional.h" #include "tensorflow/core/framework/step_stats.pb.h" diff --git a/tensorflow/core/profiler/internal/traceme_recorder_test.cc b/tensorflow/core/profiler/internal/traceme_recorder_test.cc index 8d7abc94e8f..0d586219875 100644 --- a/tensorflow/core/profiler/internal/traceme_recorder_test.cc +++ b/tensorflow/core/profiler/internal/traceme_recorder_test.cc @@ -21,7 +21,6 @@ limitations under the License. #include #include -#include #include "absl/strings/str_cat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/env_time.h" From c52875771fa395805842b8a4c3914b7e675cf9c3 Mon Sep 17 00:00:00 2001 From: Tim Shen Date: Fri, 2 Oct 2020 15:12:40 -0700 Subject: [PATCH 29/54] [MLIR] Add cbrt, reduce-precision, and bitcast ops to MHLO. 
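
In MHLO syntax the new ops look like this (condensed from the ops.mlir tests
added below):

  %0 = "mhlo.cbrt"(%arg) : (tensor<2x4xf32>) -> tensor<2x4xf32>
  %1 = "mhlo.bitcast"(%arg) : (tensor<2x4xf32>) -> tensor<2x4xf32>
  %2 = "mhlo.reduce_precision"(%arg) {exponent_bits = 2 : i32, mantissa_bits = 3 : i32}
      : (tensor<2x4xf32>) -> tensor<2x4xf32>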
PiperOrigin-RevId: 335109804
Change-Id: I0984a3db18191db07de39107886e626e5b8e090a
---
 .../mlir-hlo/Dialect/mhlo/IR/hlo_ops.td       | 20 +++++++++++++
 .../mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td  | 24 +++++++++++++++
 tensorflow/compiler/mlir/hlo/tests/ops.mlir   | 21 +++++++++++++
 .../mlir/xla/hlo_function_importer.cc         | 15 ++++++++++
 .../compiler/mlir/xla/mlir_hlo_to_hlo.cc      |  9 ++++++
 .../mlir/xla/tests/translate/export.mlir      | 30 +++++++++++++++++++
 .../mlir/xla/tests/translate/import.hlotxt    | 23 ++++++++++++++
 tensorflow/compiler/xla/client/xla_builder.cc | 10 +++++++
 tensorflow/compiler/xla/client/xla_builder.h  |  3 ++
 9 files changed, 155 insertions(+)

diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td
index 2e17834e232..ed62ef83167 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.td
@@ -157,6 +157,9 @@ def HLO_AbsOp: HLO_UnaryElementwiseOp<"abs",
     >];
 }

+def HLO_CbrtOp: HLO_UnaryElementwiseOp<"cbrt",
+    [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_CbrtOp;
+
 def HLO_CeilOp: HLO_UnaryElementwiseOp<"ceil",
     [NoSideEffect, SameOperandsAndResultType], HLO_FpTensor>, BASE_HLO_CeilOp;

@@ -1423,4 +1426,21 @@ def HLO_FusionOp : HLO_Op<"fusion", []> {
   let hasCustomHLOConverter = 1;
 }

+// This is an op for purposes internal to XLA/GPU.
+def HLO_BitcastOp : HLO_Op<"bitcast", [NoSideEffect]>, BASE_HLO_BitcastOp {
+  let arguments = (ins HLO_Tensor:$operand);
+  let results = (outs HLO_Tensor);
+  let hasCustomHLOConverter = 1;
+}
+
+def HLO_ReducePrecisionOp: HLO_Op<"reduce_precision", [SameOperandsAndResultShape]>,
+    BASE_HLO_ReducePrecisionOp {
+  let arguments = (ins
+    HLO_FpTensor:$operand,
+    I32Attr:$exponent_bits,
+    I32Attr:$mantissa_bits
+  );
+  let results = (outs HLO_FpTensor:$output);
+}
+
 #endif // HLO_OPS
diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td
index b8378fed01a..6386972809b 100644
--- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td
+++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops_base.td
@@ -127,6 +127,17 @@ class BASE_HLO_AbsOp {
   }];
 }

+class BASE_HLO_CbrtOp {
+  string summary = "Cubic root operator";
+
+  string description = [{
+    Returns element-wise cubic root of the operand.
+
+    See
+    https://www.tensorflow.org/xla/operation_semantics#element-wise_unary_functions.
+  }];
+}
+
 class BASE_HLO_CeilOp {
   string summary = "Ceil operator";

@@ -1336,4 +1347,17 @@ class BASE_HLO_WhileOp {
   }];
 }

+class BASE_HLO_BitcastOp {
+  string summary = "Bitcast operator";
+
+  string description = [{
+    This op changes the shape of the input in a way that the physical
+    arrangement of elements is unchanged.
+
+    However, the op needs layout information to make sense of "physical
+    arrangement of elements". Layout support in MHLO is currently under
+    exploration.
+ }]; +} + #endif // HLO_OPS_BASE diff --git a/tensorflow/compiler/mlir/hlo/tests/ops.mlir b/tensorflow/compiler/mlir/hlo/tests/ops.mlir index 8cf8dba9eb9..4462d9c45c6 100644 --- a/tensorflow/compiler/mlir/hlo/tests/ops.mlir +++ b/tensorflow/compiler/mlir/hlo/tests/ops.mlir @@ -1193,3 +1193,24 @@ func @incompatible_shapes(%arg0: tensor, %shape: tensor<2xindex>) -> tens %0 = "mhlo.dynamic_reshape"(%arg0, %shape) : (tensor, tensor<2xindex>) -> tensor return %0 : tensor } + +// ----- + +func @cbrt(%arg: tensor<2x4xf32>) -> tensor<2x4xf32> { + %0 = "mhlo.cbrt"(%arg) : (tensor<2x4xf32>) -> tensor<2x4xf32> + return %0 : tensor<2x4xf32> +} + +// ----- + +func @bitcast(%arg: tensor<2x4xf32>) -> tensor<2x4xf32> { + %0 = "mhlo.bitcast"(%arg) : (tensor<2x4xf32>) -> tensor<2x4xf32> + return %0 : tensor<2x4xf32> +} + +// ----- + +func @bitcast(%arg: tensor<2x4xf32>) -> tensor<2x4xf32> { + %0 = "mhlo.reduce_precision"(%arg) {exponent_bits=2 : i32, mantissa_bits=3 : i32} : (tensor<2x4xf32>) -> tensor<2x4xf32> + return %0 : tensor<2x4xf32> +} diff --git a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc index a3f68411cc3..209a7dfa7fe 100644 --- a/tensorflow/compiler/mlir/xla/hlo_function_importer.cc +++ b/tensorflow/compiler/mlir/xla/hlo_function_importer.cc @@ -681,6 +681,7 @@ StatusOr HloFunctionImporter::ImportInstructionImpl( NoAttributeCase(kAnd, AndOp); NoAttributeCase(kAtan2, Atan2Op); NoAttributeCase(kBitcastConvert, BitcastConvertOp); + NoAttributeCase(kCbrt, CbrtOp); NoAttributeCase(kConvert, ConvertOp); NoAttributeCase(kCeil, CeilOp); NoAttributeCase(kClamp, ClampOp); @@ -738,6 +739,20 @@ StatusOr HloFunctionImporter::ImportInstructionImpl( &fusion.fused_computation())); return fusion.getOperation(); } + case HloOpcode::kBitcast: + return func_builder + ->create(loc, result_type, operands, + attributes) + .getOperation(); + case HloOpcode::kReducePrecision: { + auto op = func_builder->create( + loc, result_type, operands[0], attributes); + op.exponent_bitsAttr(func_builder->getIntegerAttr( + func_builder->getI32Type(), instruction->exponent_bits())); + op.mantissa_bitsAttr(func_builder->getIntegerAttr( + func_builder->getI32Type(), instruction->mantissa_bits())); + return op.getOperation(); + } case HloOpcode::kAddDependency: // Arbitrary op code that I suspect we will not implement for quite a // while and allows testing handling of unknown ops. 
Selected because it diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index 0923f247cd2..ccfcebab60e 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -1082,6 +1082,15 @@ LogicalResult ExportXlaOp(FusionOp op, OpLoweringContext ctx) { return success(); } +LogicalResult ExportXlaOp(BitcastOp op, OpLoweringContext ctx) { + auto& value_map = *ctx.values; + xla::XlaOp operand; + if (failed(GetXlaOp(op.operand(), value_map, &operand, op))) return failure(); + value_map[op] = xla::internal::XlaBuilderFriend::BuildBitcast( + ctx.builder, operand, xla::TypeToShape(op.getType())); + return success(); +} + } // namespace } // namespace mhlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir index 84816e6715a..c078191d170 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/export.mlir +++ b/tensorflow/compiler/mlir/xla/tests/translate/export.mlir @@ -1102,3 +1102,33 @@ func @main(%arg: tensor<3xui64>) -> tuple, tensor<2x2xui32>> { %0 = "mhlo.rng_bit_generator"(%arg) {rng_algorithm = 2 : i32} : (tensor<3xui64>) -> tuple, tensor<2x2xui32>> return %0 : tuple, tensor<2x2xui32>> } + +// ----- + +// CHECK: HloModule +func @main(%arg: tensor<3x4xf32>) -> tensor<3x4xf32> { +// CHECK: %[[ARG0:.*]] = f32[3,4] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[3,4] cbrt(f32[3,4] %[[ARG0]]) + %0 = "mhlo.cbrt"(%arg) : (tensor<3x4xf32>) -> tensor<3x4xf32> + return %0 : tensor<3x4xf32> +} + +// ----- + +// CHECK: HloModule +func @main(%arg: tensor<3x4xf32>) -> tensor<3x4xf32> { +// CHECK: %[[ARG0:.*]] = f32[3,4] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[3,4] reduce-precision(f32[3,4] %[[ARG0]]), exponent_bits=8, mantissa_bits=10 + %0 = "mhlo.reduce_precision"(%arg) {exponent_bits = 8 : i32, mantissa_bits = 10 : i32} : (tensor<3x4xf32>) -> tensor<3x4xf32> + return %0 : tensor<3x4xf32> +} + +// ----- + +// CHECK: HloModule +func @main(%arg: tensor<3x4xf32>) -> tensor<3x4x1xf32> { +// CHECK: %[[ARG0:.*]] = f32[3,4] parameter(0) +// CHECK: ROOT %[[RESULT:.*]] = f32[3,4,1] bitcast(f32[3,4] %[[ARG0]]) + %0 = "mhlo.bitcast"(%arg) : (tensor<3x4xf32>) -> tensor<3x4x1xf32> + return %0 : tensor<3x4x1xf32> +} diff --git a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt index 90034ce8c07..cce49b16c6c 100644 --- a/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt +++ b/tensorflow/compiler/mlir/xla/tests/translate/import.hlotxt @@ -1014,3 +1014,26 @@ add { ROOT %rng-bit-generator.2 = (u64[3], u32[2,2]) rng-bit-generator(u64[3] %Arg_0.1), algorithm=rng_philox } +// CHECK-LABEL: func @cbrt +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xf32>) +%cbrt (Arg_0.1: f32[3,4]) -> f32[3,4] { + %Arg_0.1 = f32[3,4] parameter(0) + // CHECK: "mhlo.cbrt"(%[[ARG0]]) : (tensor<3x4xf32>) -> tensor<3x4xf32> + ROOT %cbrt = f32[3,4] cbrt(f32[3,4] %Arg_0.1) +} + +// CHECK-LABEL: func @bitcast +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xf32>) -> tensor<3x4x1xf32> +%bitcast (Arg_0.1: f32[3,4]) -> f32[3,4,1] { + %Arg_0.1 = f32[3,4] parameter(0) + // CHECK: "mhlo.bitcast"(%[[ARG0]]) : (tensor<3x4xf32>) -> tensor<3x4x1xf32> + ROOT %bitcast = f32[3,4,1] bitcast(f32[3,4] %Arg_0.1) +} + +// CHECK-LABEL: func @reduce_precision +// CHECK-SAME: (%[[ARG0:.*]]: tensor<3x4xf32>) +%reduce_precision (Arg_0.1: f32[3,4]) -> f32[3,4] { + %Arg_0.1 = f32[3,4] 
parameter(0) + // CHECK: "mhlo.reduce_precision"(%[[ARG0]]) {exponent_bits = 8 : i32, mantissa_bits = 10 : i32} : (tensor<3x4xf32>) -> tensor<3x4xf32> + ROOT %reduce_precision = f32[3,4] reduce-precision(f32[3,4] %Arg_0.1), exponent_bits=8, mantissa_bits=10 +} diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 168565e9b50..2ac3200800b 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -149,6 +149,16 @@ XlaOp XlaBuilderFriend::BuildFusion(XlaBuilder* builder, }); } +XlaOp XlaBuilderFriend::BuildBitcast(XlaBuilder* builder, XlaOp operand, + const Shape& shape) { + return builder->ReportErrorOrReturn([&]() -> StatusOr { + HloInstructionProto instr; + *instr.mutable_shape() = shape.ToProto(); + return builder->AddInstruction(std::move(instr), HloOpcode::kBitcast, + {operand}); + }); +} + HloInstructionProto* XlaBuilderFriend::GetInstruction(XlaOp op) { return &op.builder() ->instructions_[op.builder()->handle_to_index_[op.handle_]]; diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index b3fc3628442..f736ae1d470 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -57,6 +57,9 @@ struct XlaBuilderFriend { absl::string_view fusion_kind, const XlaComputation& fused_computation); + static XlaOp BuildBitcast(XlaBuilder* builder, XlaOp operand, + const Shape& shape); + static HloInstructionProto* GetInstruction(XlaOp op); }; From a20b5857915cd573f595942eab6a928ae3837639 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 2 Oct 2020 15:38:31 -0700 Subject: [PATCH 30/54] pfor: Handle TensorList/variant stacking slightly more generically Supports AddN on TensorLists, and fixes TensorLists returned from some vectorized control flow operations. Adds/fixes a couple handle data copies. PiperOrigin-RevId: 335114306 Change-Id: I9c98c3c6228922dcb5a0ed1cf07efc22ac90a2af --- tensorflow/python/eager/backprop_test.py | 27 +++++ tensorflow/python/framework/func_graph.py | 5 +- tensorflow/python/ops/parallel_for/BUILD | 1 + .../ops/parallel_for/control_flow_ops_test.py | 29 +++++ tensorflow/python/ops/parallel_for/pfor.py | 114 +++++++++++++----- tensorflow/python/ops/while_v2.py | 6 +- 6 files changed, 153 insertions(+), 29 deletions(-) diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index 0adb4698529..f192499194c 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -42,6 +42,7 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import custom_gradient from tensorflow.python.ops import embedding_ops +from tensorflow.python.ops import functional_ops from tensorflow.python.ops import gradients from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn @@ -1722,6 +1723,32 @@ class JacobianTest(test.TestCase): g.jacobian(output, inp, experimental_use_pfor=True), g.jacobian(output, inp, experimental_use_pfor=False)) + def test_foldl_partial_function(self): + x = array_ops.zeros([3]) + # TODO(allenl): Track down missing handle data when persistent=True and + # compare to experimental_use_pfor=False. 
+ with backprop.GradientTape() as tape: + tape.watch(x) + result = def_function.function( + functools.partial(functional_ops.foldl_v2, lambda a, b: a + b))( + x) + self.assertAllClose([1., 1., 1.], + tape.jacobian(result, x, experimental_use_pfor=True)) + + def test_foldl_pure_function(self): + + @def_function.function + def compute_jacobian(use_pfor): + x = array_ops.zeros([3]) + with backprop.GradientTape(persistent=True) as tape: + tape.watch(x) + result = functools.partial(functional_ops.foldl_v2, lambda a, b: a + b)( + x) + return tape.jacobian(result, x, experimental_use_pfor=use_pfor) + + self.assertAllClose(compute_jacobian(use_pfor=True), + compute_jacobian(use_pfor=False)) + @test_util.run_all_in_graph_and_eager_modes class BatchJacobianTest(test.TestCase, parameterized.TestCase): diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py index 9cff100dcc6..71c009095a0 100644 --- a/tensorflow/python/framework/func_graph.py +++ b/tensorflow/python/framework/func_graph.py @@ -1205,7 +1205,8 @@ def _get_defun_inputs(args, names, structure, flat_shapes=None): # Tensor or not. For non-tensor entries it should be None. shape = next(shapes_iter) if isinstance(arg, (ops.Tensor, tensor_spec.TensorSpec)): - if isinstance(arg, tensor_spec.TensorSpec) and arg.name: + arg_is_spec = isinstance(arg, tensor_spec.TensorSpec) + if arg_is_spec and arg.name: requested_name = arg.name else: requested_name = name @@ -1218,6 +1219,8 @@ def _get_defun_inputs(args, names, structure, flat_shapes=None): # Sometimes parameter names are not valid op names, so fall back to # unnamed placeholders. placeholder = graph_placeholder(arg.dtype, placeholder_shape) + if not arg_is_spec: + custom_gradient.copy_handle_data(arg, placeholder) if name is not None: # Record the requested/user-specified name in case it's different than # the uniquified name, for validation when exporting signatures. 
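
A minimal, hedged sketch of the pattern the new foldl tests above exercise, using public TF 2.x APIs instead of the internal test modules; it assumes `tf.foldl` wraps `functional_ops.foldl_v2` and a build that includes this patch's TensorList/variant stacking support:

```
# Jacobian of a foldl-based reduction, computed via the pfor (vectorized)
# gradient path. foldl threads a TensorList (a variant tensor) through its
# loop, which is why vectorizing this gradient needs the stacking support
# added in this patch.
import functools

import tensorflow as tf

x = tf.zeros([3])
with tf.GradientTape() as tape:
  tape.watch(x)
  result = tf.function(functools.partial(tf.foldl, lambda a, b: a + b))(x)

# result == sum(x), so the jacobian w.r.t. x is a vector of ones.
print(tape.jacobian(result, x, experimental_use_pfor=True))  # [1. 1. 1.]
```
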
diff --git a/tensorflow/python/ops/parallel_for/BUILD b/tensorflow/python/ops/parallel_for/BUILD index b189ac57bb9..119a944e867 100644 --- a/tensorflow/python/ops/parallel_for/BUILD +++ b/tensorflow/python/ops/parallel_for/BUILD @@ -22,6 +22,7 @@ py_library( ":gradients", ":test_util", "//tensorflow/compiler/tf2xla/python:xla", + "//tensorflow/core:protos_all_py", "//tensorflow/python:array_ops", "//tensorflow/python:check_ops", "//tensorflow/python:constant_op", diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py index 96d41f3b359..63b99b28f5e 100644 --- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py +++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py @@ -43,6 +43,7 @@ from tensorflow.python.ops import bitwise_ops from tensorflow.python.ops import cond_v2 from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import control_flow_v2_toggles +from tensorflow.python.ops import custom_gradient from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_list_ops from tensorflow.python.ops import gen_nn_ops @@ -1073,6 +1074,34 @@ class TensorListTest(PForTestCase): if not v2_enabled: control_flow_v2_toggles.disable_control_flow_v2() + def test_tensor_list_addn_already_stacked(self): + + def loop_fn(i): + l1 = list_ops.tensor_list_reserve([], 2, dtypes.int32) + l1 = list_ops.tensor_list_set_item(l1, 0, i) + l2 = list_ops.tensor_list_reserve([], 2, dtypes.int32) + l2 = list_ops.tensor_list_set_item(l2, 1, i) + return list_ops.tensor_list_stack(math_ops.add_n([l1, l2]), dtypes.int32) + + self._test_loop_fn(loop_fn, 2) + + def test_tensor_list_addn_stacking_required(self): + l1 = list_ops.tensor_list_reserve([], 2, dtypes.int32) + l1 = list_ops.tensor_list_set_item(l1, 1, 1) + + def loop_fn(i): + l2 = list_ops.tensor_list_reserve([], 2, dtypes.int32) + l2 = list_ops.tensor_list_set_item(l2, 1, i) + l1_graph = array_ops.identity(l1) + # TODO(b/169968286): Typically TensorLists are both created and used in a + # graph; creating TensorLists eagerly with handle data doesn't work at the + # moment. Copying the handle data manually reproduces the expected case. 
+ custom_gradient.copy_handle_data(l2, l1_graph) + return list_ops.tensor_list_stack( + math_ops.add_n([l1_graph, l2]), dtypes.int32) + + self._test_loop_fn(loop_fn, 2) + class StackTest(PForTestCase): diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py index 3c6b9c0d756..2489ecd713f 100644 --- a/tensorflow/python/ops/parallel_for/pfor.py +++ b/tensorflow/python/ops/parallel_for/pfor.py @@ -28,6 +28,7 @@ import numpy as np import six from tensorflow.compiler.tf2xla.python import xla +from tensorflow.core.framework import types_pb2 from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import execute @@ -42,6 +43,7 @@ from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import bitwise_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import custom_gradient from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_array_ops from tensorflow.python.ops import gen_image_ops @@ -59,6 +61,7 @@ from tensorflow.python.ops import map_fn from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import parsing_ops +from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.ops import special_math_ops from tensorflow.python.ops import tensor_array_ops @@ -82,14 +85,28 @@ def _stack(t, length): # produces a loop dependent output. Simply stacking the variants may not be # suitable since operations on stacked handles may expect a vectorized version # of the variant. - # Given that variant types are generic, we are currently unable to figure out - # which particular variant type is being considered here and hence it may not - # be safe to allow stacking it. if t.dtype == dtypes.variant: - raise NotImplementedError( - "Vectorization tried to stack variant tensor %s. " - "This is likely because vectorization of that variant " - "is not fully supported yet." % t) + handle_data = resource_variable_ops.get_eager_safe_handle_data(t) + if not handle_data.is_set: + raise ValueError("Required handle data not set for {!r}".format(t)) + if len(handle_data.shape_and_type) != 1: + raise ValueError("Expected handle data of length 1, got {!r} of length {}" + .format(handle_data, len(handle_data.shape_and_type))) + shape_and_type = handle_data.shape_and_type[0] + if shape_and_type.specialized_type == types_pb2.ST_TENSOR_LIST: + return wrap( + _stack_tensor_list(t, shape_and_type.dtype, length), + True) + else: + if shape_and_type.specialized_type != types_pb2.ST_INVALID: + raise ValueError( + ("Attempted to stack an unhandled variant-dtype tensor of " + "type {!r} ({!r})").format( + shape_and_type.specialized_type, t)) + else: + raise ValueError( + "Attempted to stack a variant-dtype tensor with no type set ({!r})" + .format(t)) ones = array_ops.ones_like(array_ops.shape(t)) ones = array_ops.reshape(ones, [-1]) length = array_ops.reshape(length, [-1]) @@ -735,20 +752,30 @@ class _PforInput(object): self._op = op self._inputs = inputs - def stack_inputs(self, stack_indices=None): + def stack_inputs(self, stack_indices=None, tile_variants=False): """Stacks unstacked inputs at `stack_indices`. Args: stack_indices: indices of inputs at which stacking is done. If None, stacking is done at all indices. 
+ tile_variants: If True, affected indices which have a variant dtype will + be tiled after this operation to match the expected shape of a + vectorized tensor. Variants generally need to be un-tiled when they are + inputs to operations and tiled when returned. """ if stack_indices is None: stack_indices = range(len(self._inputs)) length = self.pfor.loop_len_vector for i in stack_indices: inp = self._inputs[i] + is_variant = inp.t.dtype == dtypes.variant if not inp.is_stacked: self._inputs[i] = _stack(inp.t, length) + if tile_variants and is_variant: + self._inputs[i] = wrap( + _tile_variant_with_length(self._inputs[i].t, length), True) + elif not tile_variants and is_variant: + self._inputs[i] = wrap(_untile_variant(self._inputs[i].t), True) def expanddim_inputs_for_broadcast(self): """Reshapes stacked inputs to prepare them for broadcast. @@ -994,6 +1021,12 @@ def wrap(tensor, is_stacked=True, is_sparse_stacked=False): return WrappedTensor(tensor, is_stacked, is_sparse_stacked) +def _wrap_and_tile_variants(tensor, length): + if tensor.dtype == dtypes.variant: + tensor = _tile_variant_with_length(tensor, length) + return wrap(tensor) + + def _fallback_converter(pfor_input, warn=True): if warn: logging.warn("Using a while_loop for converting %s", pfor_input.op_type) @@ -1496,7 +1529,10 @@ class PFor(object): if y is y_op: new_outputs = new_op else: - new_outputs = [wrap(x, False) for x in new_op.outputs] + new_outputs = [] + for old_output, new_output in zip(y_op.outputs, new_op.outputs): + custom_gradient.copy_handle_data(old_output, new_output) + new_outputs.append(wrap(new_output, False)) else: # Either some inputs are not loop invariant or op is stateful. if hasattr(y_op, "pfor_converter"): @@ -2888,8 +2924,10 @@ def _convert_rank(pfor_input): @RegisterPFor("AddN") def _convert_addn(pfor_input): # AddN does not support broadcasting. - pfor_input.stack_inputs() - return wrap(math_ops.add_n([x.t for x in pfor_input.inputs]), True) + pfor_input.stack_inputs(tile_variants=False) + return _wrap_and_tile_variants( + math_ops.add_n([x.t for x in pfor_input.inputs]), + pfor_input.pfor.loop_len_vector) @RegisterPFor("Cross") @@ -3518,8 +3556,7 @@ def _convert_tensor_array_grad_v3(pfor_input): return [wrap(grad_handle, False), wrap(flow_out, True)] -def _stack_tensor_list_shape(shape, pfor_input): - first_dim = pfor_input.pfor.loop_len_vector +def _stack_tensor_list_shape(shape, first_dim): shape_value = tensor_util.constant_value(shape) # Note that negative values in the shape are used to signify unknown shapes # and are handled in a special way. @@ -3537,12 +3574,22 @@ def _stack_tensor_list_shape(shape, pfor_input): lambda: array_ops.concat([first_dim, shape], axis=0)) -def _tile_variant(t, pfor_input): +def _tile_variant_with_length(t, length): """stacks `t` `length` times.""" + original_tensor = t t.set_shape([]) t = array_ops.reshape(t, [-1]) with ops.device("CPU:0"): - return array_ops.tile(t, pfor_input.pfor.loop_len_vector) + result = array_ops.tile(t, length) + # TODO(b/169968286): Should regular shape functions do handle data + # propagation here? + custom_gradient.copy_handle_data(original_tensor, result) + return result + + +def _tile_variant(t, pfor_input): + """stacks `t` according to its loop context.""" + return _tile_variant_with_length(t, pfor_input.pfor.loop_len_vector) def _untile_variant(t): @@ -3556,7 +3603,8 @@ def _convert_tensor_list_reserve(pfor_input): element_dtype = pfor_input.get_attr("element_dtype") # Prepend a dimension to element_shape. 
- element_shape = _stack_tensor_list_shape(element_shape, pfor_input) + element_shape = _stack_tensor_list_shape(element_shape, + pfor_input.pfor.loop_len_vector) handle = list_ops.tensor_list_reserve( element_shape, num_elements, element_dtype=element_dtype) @@ -3579,16 +3627,16 @@ def _convert_tensor_list_length(pfor_input): return wrap(list_ops.tensor_list_length(handle), False) -def _stack_tensor_list(handle, dtype, pfor_input, element_shape=None): +def _stack_tensor_list(handle, dtype, loop_len_vector, element_shape=None): if element_shape is None: element_shape = list_ops.tensor_list_element_shape(handle, dtypes.int32) length = list_ops.tensor_list_length(handle) new_handle = list_ops.tensor_list_reserve( - _stack_tensor_list_shape(element_shape, pfor_input), length, dtype) + _stack_tensor_list_shape(element_shape, loop_len_vector), length, dtype) def _body_fn(i, h): elem = list_ops.tensor_list_get_item(handle, i, dtype, element_shape) - elem = _stack(elem, pfor_input.pfor.loop_len_vector).t + elem = _stack(elem, loop_len_vector).t return i + 1, list_ops.tensor_list_set_item(h, i, elem) return control_flow_ops.while_loop(lambda i, _: i < length, _body_fn, @@ -3604,7 +3652,8 @@ def _convert_tensor_list_get_item(pfor_input): if handle_stacked: handle = _untile_variant(handle) - element_shape = _stack_tensor_list_shape(element_shape, pfor_input) + element_shape = _stack_tensor_list_shape(element_shape, + pfor_input.pfor.loop_len_vector) if index_stacked: # We use a sequential loop since that may be more efficient than first # gathering and concatenating all the element corresponding to `index`, @@ -3650,7 +3699,8 @@ def _convert_tensor_array_set_item(pfor_input): return wrap( list_ops.tensor_list_scatter(item, index, input_handle=handle), False) else: - handle = _stack_tensor_list(handle, item.dtype, pfor_input) + handle = _stack_tensor_list(handle, item.dtype, + pfor_input.pfor.loop_len_vector) else: handle = _untile_variant(handle) @@ -3714,7 +3764,8 @@ def _convert_tensor_list_stack(pfor_input): num_elements = pfor_input.get_attr("num_elements") handle = _untile_variant(handle) - input_shape = _stack_tensor_list_shape(input_shape, pfor_input) + input_shape = _stack_tensor_list_shape(input_shape, + pfor_input.pfor.loop_len_vector) output = list_ops.tensor_list_stack( handle, element_dtype, @@ -3733,7 +3784,8 @@ def _convert_tensor_list_gather(pfor_input): if handle_stacked: handle = _untile_variant(handle) - element_shape = _stack_tensor_list_shape(element_shape, pfor_input) + element_shape = _stack_tensor_list_shape(element_shape, + pfor_input.pfor.loop_len_vector) if index_stacked: # We use a sequential loop since that may be more efficient than first # gathering and concatenating all the element corresponding to `index`, @@ -3776,7 +3828,8 @@ def _convert_tensor_list_scatter(pfor_input): if handle_stacked: handle = _untile_variant(handle) else: - handle = _stack_tensor_list(handle, item.dtype, pfor_input) + handle = _stack_tensor_list(handle, item.dtype, + pfor_input.pfor.loop_len_vector) item = _transpose_first_two_dims(item) handle = list_ops.tensor_list_scatter(item, indices, input_handle=handle) @@ -3788,7 +3841,8 @@ def _convert_tensor_list_from_tensor(pfor_input): tensor = pfor_input.stacked_input(0) element_shape = pfor_input.unstacked_input(1) tensor = _transpose_first_two_dims(tensor) - element_shape = _stack_tensor_list_shape(element_shape, pfor_input) + element_shape = _stack_tensor_list_shape(element_shape, + pfor_input.pfor.loop_len_vector) handle = 
list_ops.tensor_list_from_tensor(tensor, element_shape) return wrap(_tile_variant(handle, pfor_input), True) @@ -4258,7 +4312,12 @@ class WhileV2(object): if out.is_stacked != inp.is_stacked: stacking_mismatch = True mismatching_stacked_indices.append(i) - wrapped_inputs[i] = _stack(inp.t, [array_ops.size(new_indices)]) + stacked = _stack(inp.t, [array_ops.size(new_indices)]) + if inp.t.dtype == dtypes.variant: + stacked = wrap( + _tile_variant_with_length(stacked.t, + [array_ops.size(new_indices)])) + wrapped_inputs[i] = stacked if not stacking_mismatch: if mismatching_stacked_indices: # We needed to stack some inputs. This code will be abandoned and @@ -4413,7 +4472,8 @@ class WhileV2(object): _ = while_fn.get_concrete_function() if indices_to_stack: # Need to abandon the current conversion, stack some inputs and restart. - self._pfor_input.stack_inputs(stack_indices=indices_to_stack) + self._pfor_input.stack_inputs( + stack_indices=indices_to_stack, tile_variants=True) # Note that this call will recurse at most one time. The first call will # do the required stacking, based on the iterative procedure in # _process_body, and the next invocation to __call__ should not need to do diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py index 23c24476934..556360ce640 100644 --- a/tensorflow/python/ops/while_v2.py +++ b/tensorflow/python/ops/while_v2.py @@ -161,6 +161,10 @@ def while_loop(cond, Returns: A list of tensors the same length as args. """ + # The function was created with a signature rather than tensors, so + # internal placeholders were created without handle data. + _copy_handle_data(nest.flatten(loop_vars[2:], expand_composites=True), + nest.flatten(args, expand_composites=True)) # Capture the tensors already captured in cond_graph so that they appear # in the same order in body_graph.external_captures. for t in cond_graph.external_captures: @@ -372,7 +376,7 @@ def _WhileGrad(op, *grads): # pylint: disable=invalid-name while_op._add_while_inputs(new_inputs) while_op._add_outputs([t.dtype for t in new_outputs], [t.shape for t in new_outputs]) - _copy_handle_data(new_outputs, op.outputs[orig_num_params:]) + _copy_handle_data(new_outputs, while_op.outputs[orig_num_params:]) # Do not ignore grads wrt extra outputs when computing higher order # derivatives. From 9ce847ed140702d1dd4cb204a8afe0ffedb70b15 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 2 Oct 2020 15:58:52 -0700 Subject: [PATCH 31/54] Remove a few check ops that no longer need to run in tf.Variable's constructor VarHandleOp ensures there is no sharing. These aren't a huge part of startup time for replicated models, but there's still no reason to run them. 
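
As an illustration only (not part of the change), a rough sketch of the user-visible behavior, which stays the same after the asserts are removed; uniqueness of the underlying resource now comes from VarHandleOp alone:

```
# Hedged sketch: two eager variables may share a Python-level name; their
# underlying resources are still distinct. Previously this path also ran a
# VarIsInitializedOp plus an "EagerVariableNameReuse" assert; after this
# change neither op is emitted.
import tensorflow as tf

v1 = tf.Variable(1.0, name="weight")
v2 = tf.Variable(2.0, name="weight")
print(v1.numpy(), v2.numpy())  # 1.0 2.0 -- no resource collision
```
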
PiperOrigin-RevId: 335117818 Change-Id: I38a0f944a565907630f1da0ef6d896a633b296c0 --- .../python/ops/resource_variable_ops.py | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py index 548eb30df97..162f4057ff0 100644 --- a/tensorflow/python/ops/resource_variable_ops.py +++ b/tensorflow/python/ops/resource_variable_ops.py @@ -35,12 +35,12 @@ from tensorflow.python.framework import auto_control_deps_utils as acd from tensorflow.python.framework import constant_op from tensorflow.python.framework import cpp_shape_inference_pb2 from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_spec from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_array_ops -from tensorflow.python.ops import gen_logging_ops from tensorflow.python.ops import gen_resource_variable_ops from tensorflow.python.ops import gen_state_ops from tensorflow.python.ops import math_ops @@ -156,6 +156,12 @@ def _variable_handle_from_shape_and_dtype(shape, container = "" shape = tensor_shape.as_shape(shape) dtype = dtypes.as_dtype(dtype) + if not graph_mode: + if shared_name is not None: + raise errors.InternalError( + "Using an explicit shared_name is not supported executing eagerly.") + shared_name = context.shared_name() + handle = gen_resource_variable_ops.var_handle_op( shape=shape, dtype=dtype, @@ -169,19 +175,6 @@ def _variable_handle_from_shape_and_dtype(shape, _set_handle_shapes_and_types(handle, full_handle_data, graph_mode) return handle else: - # We do not want two distinct ResourceVariable objects for the same - # underlying resource in the runtime. - # When in eager mode, explicitly ensure so here. When in graph mode, it's - # ensured by always generating different variable names. - exists = gen_resource_variable_ops.var_is_initialized_op(handle) - - # We create an assert Op instead of checking right away in order to be - # compatible with ASYNC execution mode. Further, since not all devices - # support string tensors, we encode the assertion string in the Op name - gen_logging_ops._assert( # pylint: disable=protected-access - math_ops.logical_not(exists), [exists], - name="EagerVariableNameReuse") - handle_data = cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData() handle_data.is_set = True handle_data.shape_and_type.append( @@ -1709,7 +1702,7 @@ class ResourceVariable(BaseResourceVariable): # When in eager mode use a uid for the shared_name, to prevent # accidental sharing. unique_id = "%s_%d" % (handle_name, ops.uid()) - shared_name = context.shared_name() + shared_name = None # Never shared # Use attr_scope and device(None) to simulate the behavior of # colocate_with when the variable we want to colocate with doesn't # yet exist. @@ -1960,7 +1953,7 @@ class UninitializedVariable(BaseResourceVariable): unique_id = shared_name else: unique_id = "%s_%d" % (handle_name, ops.uid()) - shared_name = context.shared_name() + shared_name = None # Never shared handle = _variable_handle_from_shape_and_dtype( shape=shape, dtype=dtype, From 36314ba572f743e6bda359c345977583a9b46be8 Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Fri, 2 Oct 2020 16:06:32 -0700 Subject: [PATCH 32/54] Unbreak windows build flat_hash_set pulls in variant that does not build on MSVC & nvcc. 
external/com_google_absl\absl/types/internal/variant.h(1042): error: parameter pack "H" was referenced but not expanded detected during instantiation of class "absl::lts_2020_02_25::variant_internal::VariantStateBase [with H=T..., T=<>]" PiperOrigin-RevId: 335119227 Change-Id: I3a02b96552efa6aab0c5ede36ecb0392db5cf1e9 --- tensorflow/stream_executor/BUILD | 6 ------ tensorflow/stream_executor/stream_executor_pimpl.h | 7 ++++--- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD index 37b426a711d..33ef97f6712 100644 --- a/tensorflow/stream_executor/BUILD +++ b/tensorflow/stream_executor/BUILD @@ -92,7 +92,6 @@ cc_library( "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", @@ -141,7 +140,6 @@ cc_library( "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", @@ -353,7 +351,6 @@ cc_library( "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -492,7 +489,6 @@ cc_library( "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", @@ -532,7 +528,6 @@ cc_library( "//tensorflow/stream_executor/platform", "//third_party/eigen3", "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -584,7 +579,6 @@ cc_library( "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", "@com_google_absl//absl/base:core_headers", - "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/synchronization", diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index ce14a7f4a43..f19c76c3790 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -23,7 +23,6 @@ limitations under the License. #include #include "absl/base/macros.h" -#include "absl/container/flat_hash_set.h" #include "absl/memory/memory.h" #include "absl/synchronization/mutex.h" #include "absl/types/optional.h" @@ -520,7 +519,9 @@ class StreamExecutor { std::vector streams; { absl::MutexLock lock(&mu_); - absl::c_copy(streams_, std::back_inserter(streams)); + for (Stream *stream : streams_) { + streams.push_back(stream); + } } for (Stream *stream : streams) { @@ -760,7 +761,7 @@ class StreamExecutor { StreamExecutorMemoryAllocator allocator_; // Set of streams associated with this stream executor. 
- absl::flat_hash_set streams_ TF_GUARDED_BY(mu_); + std::set streams_ TF_GUARDED_BY(mu_); SE_DISALLOW_COPY_AND_ASSIGN(StreamExecutor); }; From 9e1ee89c7f1cfdc899cabf1503bae754bff9847e Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 2 Oct 2020 16:07:12 -0700 Subject: [PATCH 33/54] [tf.data] Fixing a bug in `tf.data.Dataset.window` handling of `drop_remainder=True`. Fixes: https://github.com/tensorflow/tensorflow/issues/43703 PiperOrigin-RevId: 335119334 Change-Id: Ie91ef58abac04a756e37724891d70b63ef4765d6 --- tensorflow/core/kernels/data/window_dataset_op.cc | 7 +++++-- tensorflow/python/data/kernel_tests/window_test.py | 11 +++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc index 4e239d0895c..69ad7ea3bb5 100644 --- a/tensorflow/core/kernels/data/window_dataset_op.cc +++ b/tensorflow/core/kernels/data/window_dataset_op.cc @@ -155,14 +155,17 @@ class WindowDatasetOp::Dataset : public DatasetBase { std::vector> window_elements; Status status = Status::OK(); { + const size_t target_size = TargetBufferSize(window_size, window_stride); + mutex_lock l(mu_); - if (!input_impl_ && buffer_.empty()) { + if (!input_impl_ && + (buffer_.empty() || + (dataset()->drop_remainder_ && buffer_.size() < target_size))) { *end_of_sequence = true; return Status::OK(); } // Add elements to the buffer. - size_t target_size = TargetBufferSize(window_size, window_stride); if (input_impl_) { *end_of_sequence = false; for (size_t i = buffer_.size(); i < target_size && !*end_of_sequence; diff --git a/tensorflow/python/data/kernel_tests/window_test.py b/tensorflow/python/data/kernel_tests/window_test.py index 98b453a5900..2515bd52f60 100644 --- a/tensorflow/python/data/kernel_tests/window_test.py +++ b/tensorflow/python/data/kernel_tests/window_test.py @@ -239,6 +239,17 @@ class WindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces(x, range(i*10, (i+1)*10)) self.assertDatasetProduces(y, range(i*10, (i+1)*10)) + @combinations.generate(test_base.default_test_combinations()) + def testDropRemainderOutput(self): + dataset = dataset_ops.Dataset.range(100) + dataset = dataset.window(30, drop_remainder=True) + dataset = dataset.flat_map(lambda x: x.batch(30)) + dataset = dataset.batch(4) + + self.assertDatasetProduces( + dataset, + expected_output=[[[y + 30 * x for y in range(30)] for x in range(3)]]) + if __name__ == "__main__": test.main() From 20597fff28fb487472806c2a5956b7f101a4e87b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 2 Oct 2020 16:07:34 -0700 Subject: [PATCH 34/54] Integrate LLVM at llvm/llvm-project@0f0cbcc4b166 Updates LLVM usage to match [0f0cbcc4b166](https://github.com/llvm/llvm-project/commit/0f0cbcc4b166) PiperOrigin-RevId: 335119390 Change-Id: I5dbc397526ce17ee8290a1c0616c9fabde64bbba --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 7981cb95d68..0a38596f151 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -712,8 +712,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "8825fec37e73eea1bc3e4f5c125e1fd02d002d6c" - LLVM_SHA256 = "0ebcb763b49d1eec4115e0db39a4008505d6ada30fd5a3189b08eada6ae8b444" + LLVM_COMMIT = "0f0cbcc4b166f32603371fb1d62ef3816cf8425f" + LLVM_SHA256 = "60fb8f8e25b31ed4d0697129df5d4bc097f6539b8b5a0ec05c5030e171344c74" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 40ce446a7b75b257118e3dbfd8bdc5a792707600 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Fri, 2 Oct 2020 16:17:17 -0700 Subject: [PATCH 35/54] Add a missing handle data copy in tf.function gradients PiperOrigin-RevId: 335120852 Change-Id: I592a34fb2369d899eb069811494fa961d823e378 --- tensorflow/python/eager/backprop_test.py | 15 +++++++++++++-- tensorflow/python/eager/function.py | 5 +++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py index f192499194c..584fed73158 100644 --- a/tensorflow/python/eager/backprop_test.py +++ b/tensorflow/python/eager/backprop_test.py @@ -1725,8 +1725,19 @@ class JacobianTest(test.TestCase): def test_foldl_partial_function(self): x = array_ops.zeros([3]) - # TODO(allenl): Track down missing handle data when persistent=True and - # compare to experimental_use_pfor=False. + with backprop.GradientTape(persistent=True) as tape: + tape.watch(x) + result = def_function.function( + functools.partial(functional_ops.foldl_v2, lambda a, b: a + b))( + x) + self.assertAllClose([1., 1., 1.], + tape.jacobian(result, x, experimental_use_pfor=True)) + self.assertAllClose([1., 1., 1.], + tape.jacobian(result, x, experimental_use_pfor=False)) + + # Non-persistent tapes take a different function gradient path, but also + # work with pfor=True. + x = array_ops.zeros([3]) with backprop.GradientTape() as tape: tape.watch(x) result = def_function.function( diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index ba424193532..bc0ee33788c 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -900,8 +900,9 @@ class _TapeGradientFunctions(object): for output in trainable_outputs: gradient_shape, gradient_dtype = default_gradient.shape_and_dtype( output) - gradients_wrt_outputs.append( - graph_placeholder(gradient_dtype, gradient_shape)) + gradient_placeholder = graph_placeholder(gradient_dtype, gradient_shape) + custom_gradient.copy_handle_data(output, gradient_placeholder) + gradients_wrt_outputs.append(gradient_placeholder) with ops.device(None): gradients_wrt_inputs = gradients_util._GradientsHelper( # pylint: disable=protected-access trainable_outputs, From b858b15d08429fe6c620a9dc7253ef6658c913fe Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 2 Oct 2020 16:35:22 -0700 Subject: [PATCH 36/54] [NFC] Fix build warning. - Eliminate reference to xla:literal_util from hdrs. 
PiperOrigin-RevId: 335123860 Change-Id: If8162098b78845d4a96dc57c496a55d8b42c0363 --- tensorflow/compiler/xla/service/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index a24ec9a760f..00eef44d2e2 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1708,7 +1708,6 @@ cc_library( srcs = ["hlo_creation_utils.cc"], hdrs = [ "hlo_creation_utils.h", - "//tensorflow/compiler/xla:literal_util", ], deps = [ ":hlo", From 8c44ebb3146dcc3e6acfd842a26ca4c0a72e1710 Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Fri, 2 Oct 2020 17:07:23 -0700 Subject: [PATCH 37/54] Fix ReductionV2 documentation symbol discrepancy. PiperOrigin-RevId: 335128806 Change-Id: I75b2a159cdd7b6d6c6660dc135501f3498bd28f3 --- tensorflow/python/ops/losses/loss_reduction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/losses/loss_reduction.py b/tensorflow/python/ops/losses/loss_reduction.py index 829bc2f811e..789a6561bfb 100644 --- a/tensorflow/python/ops/losses/loss_reduction.py +++ b/tensorflow/python/ops/losses/loss_reduction.py @@ -44,7 +44,7 @@ class ReductionV2(object): loss_obj = tf.keras.losses.CategoricalCrossentropy( reduction=tf.keras.losses.Reduction.NONE) .... - loss = tf.reduce_sum(loss_object(labels, predictions)) * + loss = tf.reduce_sum(loss_obj(labels, predictions)) * (1. / global_batch_size) ``` From 99e242e964a22d78634e993acea4b9f9c03adaa7 Mon Sep 17 00:00:00 2001 From: Zhuo Peng Date: Fri, 2 Oct 2020 17:17:25 -0700 Subject: [PATCH 38/54] Changed Metric serialization code to use the Keras serialization name (which is different from the class name when using the register_keras_serializable decorator). PiperOrigin-RevId: 335130195 Change-Id: Ie8b81045889a7a99b7634f16d299a3b7810f5040 --- .../saved_model/metric_serialization.py | 3 ++- .../saving/saved_model/saved_model_test.py | 27 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/saved_model/metric_serialization.py b/tensorflow/python/keras/saving/saved_model/metric_serialization.py index efe977ec55f..419d02811d5 100644 --- a/tensorflow/python/keras/saving/saved_model/metric_serialization.py +++ b/tensorflow/python/keras/saving/saved_model/metric_serialization.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function from tensorflow.python.keras.saving.saved_model import layer_serialization +from tensorflow.python.keras.utils import generic_utils from tensorflow.python.training.tracking import data_structures @@ -31,7 +32,7 @@ class MetricSavedModelSaver(layer_serialization.LayerSavedModelSaver): def _python_properties_internal(self): metadata = dict( - class_name=type(self.obj).__name__, + class_name=generic_utils.get_registered_name(type(self.obj)), name=self.obj.name, dtype=self.obj.dtype) metadata.update(layer_serialization.get_config(self.obj)) diff --git a/tensorflow/python/keras/saving/saved_model/saved_model_test.py b/tensorflow/python/keras/saving/saved_model/saved_model_test.py index 9615fef54b9..ac25dfda943 100644 --- a/tensorflow/python/keras/saving/saved_model/saved_model_test.py +++ b/tensorflow/python/keras/saving/saved_model/saved_model_test.py @@ -1054,6 +1054,33 @@ class MetricTest(test.TestCase, parameterized.TestCase): num_tensor_args, test_sample_weight=False) + def test_registered_custom_metric(self): + + @generic_utils.register_keras_serializable('Testing') + class 
CustomMeanMetric(keras.metrics.Mean): + + def update_state(self, *args): # pylint: disable=useless-super-delegation + # Sometimes built-in metrics return an op in update_state. Custom + # metrics don't support returning ops, so wrap the update_state method + # while returning nothing. + super(CustomMeanMetric, self).update_state(*args) + + with self.cached_session(): + metric = CustomMeanMetric() + save_dir = self._save_model_dir('first_save') + self.evaluate([v.initializer for v in metric.variables]) + loaded = self._test_metric_save_and_load( + metric, + save_dir, + num_tensor_args=1, + test_sample_weight=False) + + self._test_metric_save_and_load( + loaded, + self._save_model_dir('second_save'), + num_tensor_args=1, + test_sample_weight=False) + def test_custom_metric_wrapped_call(self): class NegativeMean(keras.metrics.Mean): From a94461e66c890cdb2ff28a22b9423e2f5bf1c52c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 2 Oct 2020 17:36:08 -0700 Subject: [PATCH 39/54] Add support for direct vocabulary setting in TextVectorization. PiperOrigin-RevId: 335132672 Change-Id: I6369f7cb9dd72a364f2d01a45fde305f8eb244b2 --- RELEASE.md | 3 ++ .../preprocessing/text_vectorization.py | 47 ++++++++++++++++++- .../preprocessing/text_vectorization_test.py | 19 ++++++++ ...al.preprocessing.-text-vectorization.pbtxt | 2 +- 4 files changed, 69 insertions(+), 2 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 4aa05bc02f1..3e8cb4d4382 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -206,6 +206,9 @@ `fit()`. Running multiple batches inside a single `tf.function` call can greatly improve performance on TPUs or small models with a large Python overhead. + * Improvements to Keras preprocessing layers: + * TextVectorization can now accept a vocabulary list or file as an + init arg. * `tf.function` / AutoGraph: * Added `experimental_follow_type_hints` argument for `tf.function`. When diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py index 36e326bdc5c..6449d8afaf7 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization.py @@ -155,6 +155,10 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer): the number of unique tokens in the vocabulary is less than max_tokens, resulting in a tensor of shape [batch_size, max_tokens] regardless of vocabulary size. Defaults to True. + vocabulary: An optional list of vocabulary terms, or a path to a text file + containing a vocabulary to load into this layer. The file should contain + one token per line. If the list or file contains the same token multiple + times, an error will be thrown. Example: This example instantiates a TextVectorization layer that lowercases text, @@ -196,6 +200,43 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer): array([[2, 1, 4, 0], [1, 3, 0, 0]]) + Example: + This example instantiates a TextVectorization layer by passing a list + of vocabulary terms to the layer's __init__ method. 
+ + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_array) + >>> vocab_data = ["earth", "wind", "and", "fire"] + >>> max_len = 4 # Sequence length to pad the outputs to. + >>> + >>> # Create the layer, passing the vocab directly. You can also pass the + >>> # vocabulary arg a path to a file containing one vocabulary word per + >>> # line. + >>> vectorize_layer = TextVectorization( + ... max_tokens=max_features, + ... output_mode='int', + ... output_sequence_length=max_len, + ... vocabulary=vocab_data) + >>> + >>> # Because we've passed the vocabulary directly, we don't need to adapt + >>> # the layer - the vocabulary is already set. The vocabulary contains the + >>> # padding token ('') and OOV token ('[UNK]') as well as the passed tokens. + >>> vectorize_layer.get_vocabulary() + ['', '[UNK]', 'earth', 'wind', 'and', 'fire'] + """ # TODO(momernick): Add an examples section to the docstring. @@ -207,6 +248,7 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer): output_mode=INT, output_sequence_length=None, pad_to_max_tokens=True, + vocabulary=None, **kwargs): # This layer only applies to string processing, and so should only have @@ -295,7 +337,7 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer): mask_token = "" if output_mode in [None, INT] else None self._index_lookup_layer = self._get_index_lookup_class()( - max_tokens=max_tokens, mask_token=mask_token) + max_tokens=max_tokens, mask_token=mask_token, vocabulary=vocabulary) # If this layer is configured for string or integer output, we do not # create a vectorization layer (as the output is not vectorized). @@ -404,6 +446,9 @@ class TextVectorization(base_preprocessing_layer.CombinerPreprocessingLayer): return self._index_lookup_layer.get_vocabulary() def get_config(self): + # This does not include the 'vocabulary' arg, since if the vocab was passed + # at init time it's now stored in variable state - we don't need to + # pull it off disk again. 
config = { "max_tokens": self._max_tokens, "standardize": self._standardize, diff --git a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py index e7f61e94724..b79fc5cf10f 100644 --- a/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py +++ b/tensorflow/python/keras/layers/preprocessing/text_vectorization_test.py @@ -690,6 +690,25 @@ class TextVectorizationPreprocessingTest( self.assertAllEqual(expected_output, output) + def test_vocab_setting_via_init(self): + vocab_data = ["earth", "wind", "and", "fire"] + input_array = np.array([["earth", "wind", "and", "fire"], + ["fire", "and", "earth", "michigan"]]) + expected_output = [[2, 3, 4, 5], [5, 4, 2, 1]] + + input_data = keras.Input(shape=(None,), dtype=dtypes.string) + layer = get_layer_class()( + max_tokens=None, + standardize=None, + split=None, + output_mode=text_vectorization.INT, + vocabulary=vocab_data) + int_data = layer(input_data) + model = keras.Model(inputs=input_data, outputs=int_data) + + output_dataset = model.predict(input_array) + self.assertAllEqual(expected_output, output_dataset) + @keras_parameterized.run_all_keras_modes class TextVectorizationDistributionTest( diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt index 4d5a28fc8b4..9fc7c410480 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.experimental.preprocessing.-text-vectorization.pbtxt @@ -119,7 +119,7 @@ tf_class { } member_method { name: "__init__" - argspec: "args=[\'self\', \'max_tokens\', \'standardize\', \'split\', \'ngrams\', \'output_mode\', \'output_sequence_length\', \'pad_to_max_tokens\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'lower_and_strip_punctuation\', \'whitespace\', \'None\', \'int\', \'None\', \'True\'], " + argspec: "args=[\'self\', \'max_tokens\', \'standardize\', \'split\', \'ngrams\', \'output_mode\', \'output_sequence_length\', \'pad_to_max_tokens\', \'vocabulary\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'lower_and_strip_punctuation\', \'whitespace\', \'None\', \'int\', \'None\', \'True\', \'None\'], " } member_method { name: "adapt" From bbbacb1d649fe22b3629c67477e5d93d2bc79cc4 Mon Sep 17 00:00:00 2001 From: Chenkai Kuang Date: Fri, 2 Oct 2020 17:40:46 -0700 Subject: [PATCH 40/54] Move tests from strategy_reduce_test.py to strategy_common_test.py. strategy_common_test is now a kitchen sink for testing methods that are common in all strategies. We can consider breaking it down if it grows significantly.
PiperOrigin-RevId: 335133163 Change-Id: I744bcd96ba27a49e1f79d71a5e53331c0bc42b74 --- tensorflow/python/distribute/BUILD | 18 ------- .../python/distribute/strategy_common_test.py | 47 ++++++++++++----- .../python/distribute/strategy_reduce_test.py | 52 ------------------- 3 files changed, 35 insertions(+), 82 deletions(-) delete mode 100644 tensorflow/python/distribute/strategy_reduce_test.py diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 0fc54b0c8d1..e5db066100e 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1335,24 +1335,6 @@ distribute_py_test( ], ) -distribute_py_test( - name = "strategy_reduce_test", - srcs = ["strategy_reduce_test.py"], - disable_mlir_bridge = False, - main = "strategy_reduce_test.py", - tags = [ - "multi_and_single_gpu", - ], - deps = [ - ":combinations", - ":strategy_combinations", - "//tensorflow/python:errors", - "//tensorflow/python:variables", - "//tensorflow/python/eager:test", - "@absl_py//absl/testing:parameterized", - ], -) - py_library( name = "single_loss_example", srcs = ["single_loss_example.py"], diff --git a/tensorflow/python/distribute/strategy_common_test.py b/tensorflow/python/distribute/strategy_common_test.py index 5eeeb11fa8f..199851ab6c2 100644 --- a/tensorflow/python/distribute/strategy_common_test.py +++ b/tensorflow/python/distribute/strategy_common_test.py @@ -51,7 +51,33 @@ from tensorflow.python.util import nest mode=['eager'])) class StrategyTest(test.TestCase, parameterized.TestCase): - def testSimpleReduce(self, strategy): + def testCaptureReplicaId(self, strategy): + m = {} + + @def_function.function + def f(): + return ds_context.get_replica_context().replica_id_in_sync_group + + @def_function.function + def g(): + # Make g() a stateful function so it's traced twice. + if m.get('v', None) is None: + m['v'] = variables.Variable(0.) + return strategy.run(f) + + g() + + +@combinations.generate( + combinations.combine( + strategy=[ + strategy_combinations.multi_worker_mirrored_2x1_cpu, + strategy_combinations.multi_worker_mirrored_2x1_gpu, + ] + strategy_combinations.all_strategies, + mode=['eager'])) +class ReduceTest(test.TestCase, parameterized.TestCase): + + def testBasic(self, strategy): per_replica_value = strategy.experimental_distribute_values_from_function( lambda _: array_ops.ones((), dtypes.float32)) @@ -72,21 +98,18 @@ class StrategyTest(test.TestCase, parameterized.TestCase): self.assertEqual(fn_eager().numpy(), 1.0 * strategy.num_replicas_in_sync) self.assertEqual(fn_graph().numpy(), 1.0 * strategy.num_replicas_in_sync) - def testCaptureReplicaId(self, strategy): - m = {} + def testAxis(self, strategy): @def_function.function - def f(): - return ds_context.get_replica_context().replica_id_in_sync_group + def fn(): + return constant_op.constant([1., 2.]) - @def_function.function - def g(): - # Make g() a stateful function so it's traced twice. - if m.get('v', None) is None: - m['v'] = variables.Variable(0.) 
- return strategy.run(f) + x = strategy.run(fn) - g() + x_m = strategy.reduce(reduce_util.ReduceOp.MEAN, x, axis=0) + self.assertEqual(1.5, x_m) + x_s = strategy.reduce(reduce_util.ReduceOp.SUM, x, axis=0) + self.assertEqual(3 * strategy.num_replicas_in_sync, x_s) @combinations.generate( diff --git a/tensorflow/python/distribute/strategy_reduce_test.py b/tensorflow/python/distribute/strategy_reduce_test.py deleted file mode 100644 index a87cce2f0b8..00000000000 --- a/tensorflow/python/distribute/strategy_reduce_test.py +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright 2020 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== -"""Tests for `strategy.reduce`.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from absl.testing import parameterized - -from tensorflow.python.distribute import combinations -from tensorflow.python.distribute import reduce_util -from tensorflow.python.distribute import strategy_combinations -from tensorflow.python.eager import def_function -from tensorflow.python.eager import test -from tensorflow.python.framework import constant_op - - -class StrategyReduceTest(test.TestCase, parameterized.TestCase): - - @combinations.generate( - combinations.combine( - distribution=strategy_combinations.all_strategies, - mode=["eager"] - )) - def test_reduce_with_axis(self, distribution): - - @def_function.function - def fn(): - return constant_op.constant([1., 2.]) - x = distribution.run(fn) - - x_m = distribution.reduce(reduce_util.ReduceOp.MEAN, x, axis=0) - self.assertEqual(1.5, self.evaluate(x_m)) - x_s = distribution.reduce(reduce_util.ReduceOp.SUM, x, axis=0) - self.assertEqual(3 * distribution.num_replicas_in_sync, self.evaluate(x_s)) - - -if __name__ == "__main__": - test.main() From c8ca19792df04a03964db6da7b27c17c92855485 Mon Sep 17 00:00:00 2001 From: Mehmet Deveci Date: Fri, 2 Oct 2020 17:41:46 -0700 Subject: [PATCH 41/54] TensorTracer: Removing the necessity of dumping traces to get meaningful reads. PiperOrigin-RevId: 335133281 Change-Id: I18a903543959ad7fc66aab98477a0196b49f0b9b --- tensorflow/python/tpu/tensor_tracer.py | 105 ++++++++++++++++--- tensorflow/python/tpu/tensor_tracer_flags.py | 5 +- 2 files changed, 95 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py index 0942ce814f0..1a5c1330097 100644 --- a/tensorflow/python/tpu/tensor_tracer.py +++ b/tensorflow/python/tpu/tensor_tracer.py @@ -1317,18 +1317,82 @@ class TensorTracer(object): tensor_tracer_flags.TRACE_MODE_SUMMARY, tensor_tracer_flags.TRACE_MODE_FULL_TENSOR_SUMMARY) - def _generate_flush_cache_op(self, num_replicas, on_tpu): + def _inspect_summary_cache(self, cache, replica_id, step_num, output_stream, + tensor_trace_order): + """Generates a print operation to print trace inspection. 
+ + Args: + cache: Tensor storing the trace results for the step. + replica_id: Tensor storing the replica id of the running core. + step_num: Step number. + output_stream: Where to print the outputs, e.g., file path, or sys.stderr. + tensor_trace_order: TensorTraceOrder object holding tensorname to id map. + + Returns: + The Op to flush the cache to file. + """ + def _inspect_tensor(tensor): + """Returns the text to be printed for inspection output.""" + if (self._parameters.trace_mode == + tensor_tracer_flags.TRACE_MODE_NAN_INF): + return control_flow_ops.cond( + math_ops.greater(tensor, 0.0), + lambda: 'has NaNs/Infs!', + lambda: 'has no NaNs or Infs.') + else: + return tensor + + # No need to print core numbers if the cache is merged already. + if self._parameters.collect_summary_per_core: + core_message = ['core:', replica_id, ','] + else: + core_message = [] + + # Check if the cache includes any nan or inf + if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_NAN_INF: + # Cache has 1s or 0s if the mode is NaN_INF + step_has_nan_or_inf = math_ops.greater(math_ops.reduce_sum(cache), 0.0) + else: + # Cache has the actual numerics for other modes. + step_has_nan_or_inf = math_ops.reduce_any( + gen_math_ops.logical_or( + gen_math_ops.is_nan(cache), gen_math_ops.is_inf(cache))) + + # Summarizing message for each step. + step_error_message = control_flow_ops.cond( + step_has_nan_or_inf, + lambda: 'NaNs or Infs in the step!', + lambda: 'No numerical issues have been found for the step.') + + print_op = logging_ops.print_v2( + '\n', *core_message, 'step:', step_num, '-->', step_error_message, + 'Printing tensors for mode:%s...' % self._parameters.trace_mode, + summarize=-1, + output_stream=output_stream) + + for tensor_name, cache_idx in sorted( + tensor_trace_order.tensorname_to_cache_idx.items(), + key=lambda item: item[1]): + with ops.control_dependencies([print_op]): + print_op = logging_ops.print_v2( + *core_message, 'step:', step_num, ',', + tensor_name, '-->', _inspect_tensor(cache[cache_idx, 0]), + summarize=-1, output_stream=output_stream) + return print_op + + def _generate_flush_cache_op(self, num_replicas, on_tpu, tensor_trace_order): """Generates an Op that will flush the cache to file. Args: num_replicas: total number of replicas. on_tpu: if the graph is executed on TPU. + tensor_trace_order: TensorTraceOrder object holding tensorname to id map. Returns: The Op to flush the cache to file. 
""" - def _flush_fun(cache, replica_id): + def _flush_fun(cache, replica_id, step_num): """Flushes the cache to a file corresponding to replica_id.""" def _f(file_index): @@ -1346,12 +1410,20 @@ class TensorTracer(object): new_step_line = _REPLICA_ID_TAG + replica_str print_ops = [] - for i in range(self._num_signature_dimensions()): - print_ops.append(logging_ops.print_v2( - new_step_line, '\n', - cache[:, i], '\n', - summarize=-1, - output_stream=output_stream)) + if self._parameters.inspect_trace: + if self._num_signature_dimensions() > 1: + raise ValueError('Inspecting multi signatures are not supported.') + print_ops.append(self._inspect_summary_cache( + cache=cache, replica_id=replica_id, step_num=step_num, + output_stream=output_stream, + tensor_trace_order=tensor_trace_order)) + else: + for i in range(self._num_signature_dimensions()): + print_ops.append(logging_ops.print_v2( + new_step_line, '\n', + cache[:, i], '\n', + summarize=-1, + output_stream=output_stream)) with ops.control_dependencies(print_ops): return constant_op.constant(0).op return _print_cache @@ -1388,10 +1460,12 @@ class TensorTracer(object): cache_val = self.merge_caches_on_tpu(cache_val) cache_val = self.aggregate_global_cache(cache_val)[0] - flush_op = tpu.outside_compilation(_flush_fun, - cache_val, self._replica_id) + flush_op = tpu.outside_compilation( + _flush_fun, cache_val, self._replica_id, + training_util.get_or_create_global_step()) else: - flush_op = _flush_fun(cache_val, self._replica_id) + flush_op = _flush_fun(cache_val, self._replica_id, + training_util.get_or_create_global_step()) if self._use_temp_cache(): with ops.control_dependencies([flush_op]): return constant_op.constant(0).op @@ -1405,13 +1479,15 @@ class TensorTracer(object): with ops.control_dependencies([assign_op]): return constant_op.constant(0).op - def _flush_tensor_values_cache(self, tensor_fetches, op_fetches, on_tpu): + def _flush_tensor_values_cache(self, tensor_fetches, op_fetches, on_tpu, + tensor_trace_order): """Flushes the intermediate tensor values in the graph to the cache. Args: tensor_fetches: list of tensor results returned by the model_fn. op_fetches: list of ops that are returned by the model_fn, e.g., train_op. on_tpu: if the graph is executed on TPU. + tensor_trace_order: TensorTraceOrder object holding tensorname to id map. Returns: An identical copy of tensor_fetches. @@ -1421,7 +1497,7 @@ class TensorTracer(object): with ops.control_dependencies(op_fetches + [tensor.op for tensor in tensor_fetches]): flush_cache_op = self._generate_flush_cache_op( - self._tt_config.num_replicas, on_tpu) + self._tt_config.num_replicas, on_tpu, tensor_trace_order) return control_flow_ops.tuple(tensor_fetches, control_inputs=[flush_cache_op]) @@ -1837,7 +1913,8 @@ class TensorTracer(object): del self._host_call_fn[_TT_HOSTCALL_KEY] else: processed_t_fetches = self._flush_tensor_values_cache( - processed_t_fetches, op_fetches, on_tpu=on_tpu) + processed_t_fetches, op_fetches, on_tpu=on_tpu, + tensor_trace_order=tensor_trace_order) # processed_t_fetches is a list at this point. Convert it to the same # format as given in tensor_fetches. 
diff --git a/tensorflow/python/tpu/tensor_tracer_flags.py b/tensorflow/python/tpu/tensor_tracer_flags.py index 6d84314185b..ba375737866 100644 --- a/tensorflow/python/tpu/tensor_tracer_flags.py +++ b/tensorflow/python/tpu/tensor_tracer_flags.py @@ -69,6 +69,7 @@ FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS = 'dump_graphs' FLAG_NAME_SUMMARY_SIGNATURES = 'signatures' FLAG_NAME_SUMMARY_PER_CORE = 'collect_summary_per_core' FLAG_NAME_TEMP_CACHE_VAR = 'use_temp_cache' +FLAG_NAME_INSPECT_TRACE = 'inspect_trace' FLAG_NAME_FINGERPRINT_DIR = 'use_fingerprint_subdirectory' _OP_RANGE_PAT = re.compile(r'(\d+):(\d+)') @@ -125,6 +126,7 @@ class TTParameters(object): TRACE_MODE_MAX_ABS, TRACE_MODE_SUMMARY) self.use_temp_cache_var = self.is_flag_on(FLAG_NAME_TEMP_CACHE_VAR) + self.inspect_trace = self.is_flag_on(FLAG_NAME_INSPECT_TRACE) self.use_fingerprint_subdir = self.is_flag_on(FLAG_NAME_FINGERPRINT_DIR) _, self.graph_dump_path = self.get_flag_value( @@ -250,7 +252,8 @@ class TTParameters(object): FLAG_NAME_OP_RANGE, FLAG_NAME_DUMP_BEFORE_AFTER_GRAPHS, FLAG_NAME_TRACE_LEVEL, FLAG_NAME_SUMMARY_SIGNATURES, FLAG_NAME_SUMMARY_PER_CORE, - FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR + FLAG_NAME_TEMP_CACHE_VAR, FLAG_NAME_FINGERPRINT_DIR, + FLAG_NAME_INSPECT_TRACE ] tensor_tracer_flags = self._env.get(FLAGS_ENV_VAR) if not tensor_tracer_flags: From 84ab50e0e33e95c85c9ad5e237bf38a917436609 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 2 Oct 2020 17:42:45 -0700 Subject: [PATCH 42/54] Rename tf_cuda_runtime_wrappers.cpp to tf_cuda_runtime_wrappers.cc (NFC). PiperOrigin-RevId: 335133391 Change-Id: I97583476632023823aa011f026d532731325116b --- tensorflow/compiler/mlir/tools/kernel_gen/BUILD | 2 +- ...tf_cuda_runtime_wrappers.cpp => tf_cuda_runtime_wrappers.cc} | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename tensorflow/compiler/mlir/tools/kernel_gen/{tf_cuda_runtime_wrappers.cpp => tf_cuda_runtime_wrappers.cc} (97%) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 834115af907..181b928bfd5 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -162,7 +162,7 @@ cc_library( cc_library( name = "tf_cuda_runtime_wrappers", - srcs = ["tf_cuda_runtime_wrappers.cpp"], + srcs = ["tf_cuda_runtime_wrappers.cc"], compatible_with = get_compatible_with_cloud(), deps = [ "//tensorflow/core/platform/default/build_config:stream_executor_cuda", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp b/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cc similarity index 97% rename from tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp rename to tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cc index 5633f639f04..3744a5ea31f 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cpp +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_cuda_runtime_wrappers.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "llvm/ADT/ArrayRef.h" #include "llvm/Support/raw_ostream.h" -#include "third_party/llvm/llvm-project/mlir/include/mlir/ExecutionEngine/CRunnerUtils.h" +#include "mlir/ExecutionEngine/CRunnerUtils.h" // from @llvm-project #if GOOGLE_CUDA #include "third_party/gpus/cuda/include/cuda.h" From b2503c6a5e5c86a5c29eb2f7b902fc795cc125cf Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Fri, 2 Oct 2020 18:01:40 -0700 Subject: [PATCH 43/54] Add a python binding for verifying the tfr text module PiperOrigin-RevId: 335135384 Change-Id: I87a67b2ab7ea2ea1a7ca0e47f4f60be58ee486da --- tensorflow/compiler/mlir/tfr/BUILD | 21 +++++++ .../compiler/mlir/tfr/python/tfr_wrapper.cc | 58 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD index e7ce1a79dab..579cc39e1cf 100644 --- a/tensorflow/compiler/mlir/tfr/BUILD +++ b/tensorflow/compiler/mlir/tfr/BUILD @@ -1,4 +1,5 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_python_pybind_extension") load( "//third_party/mlir:tblgen.bzl", "gentbl", @@ -226,3 +227,23 @@ cc_library( ], alwayslink = 1, ) + +tf_python_pybind_extension( + name = "tfr_wrapper", + srcs = ["python/tfr_wrapper.cc"], + module_name = "tfr_wrapper", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tfr", + "//tensorflow/python:pybind11_lib", + "//tensorflow/python:pybind11_status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@llvm-project//mlir:SCFDialect", + "@llvm-project//mlir:Shape", + "@llvm-project//mlir:StandardOps", + "@pybind11", + ], +) diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc new file mode 100644 index 00000000000..b7372cffe2d --- /dev/null +++ b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc @@ -0,0 +1,58 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/Dialect/SCF/SCF.h"  // from @llvm-project
+#include "mlir/Dialect/Shape/IR/Shape.h"  // from @llvm-project
+#include "mlir/Dialect/StandardOps/IR/Ops.h"  // from @llvm-project
+#include "mlir/IR/AsmState.h"  // from @llvm-project
+#include "mlir/IR/MLIRContext.h"  // from @llvm-project
+#include "mlir/IR/Verifier.h"  // from @llvm-project
+#include "mlir/Parser.h"  // from @llvm-project
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h"
+#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h"
+#include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h"
+#include "tensorflow/python/lib/core/pybind11_lib.h"
+#include "tensorflow/python/lib/core/pybind11_status.h"
+
+PYBIND11_MODULE(tfr_wrapper, m) {
+  m.def("verify", [](std::string input) {
+    mlir::MLIRContext ctx(/*loadAllDialects=*/true);
+    auto& registry = ctx.getDialectRegistry();
+    registry.insert<mlir::scf::SCFDialect, mlir::shape::ShapeDialect, mlir::StandardOpsDialect, mlir::TF::TensorFlowDialect, mlir::tf_executor::TensorFlowExecutorDialect, mlir::TFR::TFRDialect>();
+    ctx.getDialectRegistry().loadAll(&ctx);
+
+    llvm::SourceMgr source_mgr = llvm::SourceMgr();
+    source_mgr.AddNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(input),
+                                  llvm::SMLoc());
+    auto module = mlir::parseSourceFile(source_mgr, &ctx);
+    if (!module) {
+      return false;
+    }
+
+    mlir::SourceMgrDiagnosticHandler sourceMgrHandler(source_mgr, &ctx);
+    if (failed(mlir::verify(*module))) {
+      module->emitError("Invalid MLIR module: failed verification.");
+      return false;
+    }
+    return true;
+  });
+}

From 3b0672c24b134f9eb7549cc4cefcb23c773295cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 2 Oct 2020 18:15:44 -0700
Subject: [PATCH 44/54] Updated the description of K.dot

https://github.com/tensorflow/tensorflow/issues/43653

PiperOrigin-RevId: 335137025
Change-Id: I16261cab89a9fe7f61660ef2fe16b7322eb061e9
---
 tensorflow/python/keras/backend.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index 7bab18084dd..b8ae91dff02 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -1817,6 +1817,8 @@ def moving_average_update(x, value, momentum):
 def dot(x, y):
   """Multiplies 2 tensors (and/or variables) and returns a tensor.
 
+  This operation corresponds to `numpy.dot(a, b, out=None)`.
+
   Arguments:
       x: Tensor or variable.
       y: Tensor or variable.
@@ -1826,6 +1828,7 @@ def dot(x, y):
 
   Examples:
 
+  If inputs `x` and `y` are 2-D arrays, then it is equivalent to `tf.matmul`.
   >>> x = tf.keras.backend.placeholder(shape=(2, 3))
   >>> y = tf.keras.backend.placeholder(shape=(3, 4))
   >>> xy = tf.keras.backend.dot(x, y)
@@ -1838,6 +1841,8 @@ def dot(x, y):
   >>> xy
 
+  If `x` is an N-D array and `y` is an M-D array (where M>=2), it is a sum
+  product over the last axis of `x` and the second-to-last axis of `y`.
   >>> x = tf.keras.backend.random_uniform_variable(shape=(2, 3), low=0, high=1)
   >>> y = tf.keras.backend.ones((4, 3, 5))
   >>> xy = tf.keras.backend.dot(x, y)

From b1109ff54544be100bdb88a6661e8938937cac7b Mon Sep 17 00:00:00 2001
From: Xiao Yu
Date: Fri, 2 Oct 2020 19:22:09 -0700
Subject: [PATCH 45/54] Enable a few test targets for tfrt. Disable
 quantization test since we don't plan to have quantization support in the
 initial launch.
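
A minimal usage sketch for the tfr_wrapper.verify binding added in patch 43
above. The Python import path is an assumption; only the function name and its
boolean result come from the pybind11 definition:

    from tensorflow.compiler.mlir.tfr.python import tfr_wrapper  # assumed path

    mlir_text = """
    func @identity(%arg0: tensor<f32>) -> tensor<f32> {
      return %arg0 : tensor<f32>
    }
    """
    assert tfr_wrapper.verify(mlir_text)        # parses and verifies -> True
    assert not tfr_wrapper.verify("not mlir")   # parse failure -> False
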
PiperOrigin-RevId: 335143411 Change-Id: I606bacf12bd9b349da304cd97a8acc081dc758f0 --- tensorflow/python/kernel_tests/BUILD | 4 ++++ tensorflow/python/kernel_tests/bitcast_op_test.py | 1 + tensorflow/python/kernel_tests/constant_op_test.py | 3 +++ tensorflow/python/kernel_tests/cwise_ops_binary_test.py | 1 + tensorflow/python/kernel_tests/dynamic_stitch_op_test.py | 1 + tensorflow/python/kernel_tests/spacetodepth_op_test.py | 1 + 6 files changed, 11 insertions(+) diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD index 4559deafc86..4091adf280e 100644 --- a/tensorflow/python/kernel_tests/BUILD +++ b/tensorflow/python/kernel_tests/BUILD @@ -1810,6 +1810,7 @@ cuda_py_test( name = "bitcast_op_test", size = "small", srcs = ["bitcast_op_test.py"], + tfrt_enabled = True, deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -1841,6 +1842,7 @@ cuda_py_test( name = "constant_op_test", size = "small", srcs = ["constant_op_test.py"], + tfrt_enabled = True, deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -2096,6 +2098,7 @@ cuda_py_test( name = "dynamic_stitch_op_test", size = "small", srcs = ["dynamic_stitch_op_test.py"], + tfrt_enabled = True, deps = [ "//tensorflow/python:client_testlib", "//tensorflow/python:data_flow_grad", @@ -2683,6 +2686,7 @@ cuda_py_test( "no_windows", "no_windows_gpu", ], + tfrt_enabled = True, deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", diff --git a/tensorflow/python/kernel_tests/bitcast_op_test.py b/tensorflow/python/kernel_tests/bitcast_op_test.py index ed6d7799c7e..5551469aa73 100644 --- a/tensorflow/python/kernel_tests/bitcast_op_test.py +++ b/tensorflow/python/kernel_tests/bitcast_op_test.py @@ -82,6 +82,7 @@ class BitcastTest(test.TestCase): datatype = dtypes.int8 array_ops.bitcast(x, datatype, None) + @test_util.disable_tfrt("b/169901260") def testQuantizedType(self): shape = [3, 4] x = np.zeros(shape, np.uint16) diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py index e35b62a4556..e965c52ee29 100644 --- a/tensorflow/python/kernel_tests/constant_op_test.py +++ b/tensorflow/python/kernel_tests/constant_op_test.py @@ -456,6 +456,7 @@ class ZerosTest(test.TestCase): self.assertFalse(np.any(z_value)) self.assertEqual((2, 3), z_value.shape) + @test_util.disable_tfrt("b/169901260") def testQint8Dtype(self): dtype = dtypes_lib.qint8 z = array_ops.zeros([2, 3], dtype=dtype) @@ -466,6 +467,7 @@ class ZerosTest(test.TestCase): z_value = self.evaluate(math_ops.cast(z, dtypes_lib.int32)) self.assertFalse(np.any(z_value)) + @test_util.disable_tfrt("b/169901260") def testQint16Dtype(self): dtype = dtypes_lib.qint16 z = array_ops.zeros([2, 3], dtype=dtype) @@ -650,6 +652,7 @@ class OnesTest(test.TestCase): self.assertEqual([2, 3], z.get_shape()) self.assertAllEqual(z, np.ones([2, 3])) + @test_util.disable_tfrt("b/169901260") def testQintDtype(self): @def_function.function(autograph=False) diff --git a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py index 98832dd9885..1f8f6ac6153 100644 --- a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py +++ b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py @@ -991,6 +991,7 @@ class ComparisonOpTest(test.TestCase): [[True, True, True, True, True], [False, False, False, False, False]], values) + @test_util.disable_tfrt("b/169901260") def testEqualQuantizeDType(self): 
dtypes = [ dtypes_lib.qint8, diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py index 50d11a62793..5c4df4c6ac7 100644 --- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py +++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py @@ -62,6 +62,7 @@ class DynamicStitchTestBase(object): # length. self.assertEqual([None], stitched_t.get_shape().as_list()) + @test_util.disable_tfrt("b/169901260") def testSimpleOneDimensional(self): # Test various datatypes in the simple case to ensure that the op was # registered under those types. diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py index 762a644b065..6b229ea80f7 100644 --- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py +++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py @@ -309,6 +309,7 @@ class SpaceToDepthTest(test.TestCase): actual_vals, expected_vals = self.evaluate([actual, expected]) self.assertTrue(np.array_equal(actual_vals, expected_vals)) + @test_util.disable_tfrt("b/169901260") def testAgainstTranspose(self): self.compareToTranspose(3, 2, 3, 1, 2, "NHWC", dtypes.float32, False) self.compareToTranspose(1, 2, 3, 2, 2, "NHWC", dtypes.float32, False) From 6f34adbeeb0ff4d097749faf3e962a440ce5c71e Mon Sep 17 00:00:00 2001 From: Randy Dodgen Date: Fri, 2 Oct 2020 19:29:49 -0700 Subject: [PATCH 46/54] op_kernel_test.cc: Minor fixes for MSVC / C++14. PiperOrigin-RevId: 335143984 Change-Id: Id37acca1de9dd2257a281f91cd76d5fbdfb32da0 --- tensorflow/core/framework/op_kernel_test.cc | 23 ++++++++------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc index fb31b2d203b..9b5648927d1 100644 --- a/tensorflow/core/framework/op_kernel_test.cc +++ b/tensorflow/core/framework/op_kernel_test.cc @@ -1152,22 +1152,17 @@ TEST(RegisteredKernels, GetRegisteredKernelsForOp) { EXPECT_EQ(kernel_list.kernel(0).device_type(), "CPU"); } -#define EXTRACT_KERNEL_NAME_AND_BUILDER_IMPL(kernel_name, kernel_builder, ...) \ - constexpr char const* kKernelName = kernel_name; \ - auto builder = []() { \ - return std::unique_ptr(kernel_builder.Build()); \ - }; -#define EXTRACT_KERNEL_NAME_AND_BUILDER(kernel_builder) \ - TF_EXTRACT_KERNEL_NAME(EXTRACT_KERNEL_NAME_AND_BUILDER_IMPL, kernel_builder) +// EXTRACT_KERNEL_NAME_TO_STRING wraps TF_EXTRACT_KERNEL_NAME for testing +// (it involves quite a bit of macro-magic). +#define EXTRACT_KERNEL_NAME_TO_STRING_IMPL(name, kernel_builder, ...) name +#define EXTRACT_KERNEL_NAME_TO_STRING(kernel_builder) \ + TF_EXTRACT_KERNEL_NAME(EXTRACT_KERNEL_NAME_TO_STRING_IMPL, kernel_builder) TEST(RegisterKernelMacro, ExtractName) { - constexpr char const* kName = "Foo"; - constexpr char const* kLabel = "Label"; - EXTRACT_KERNEL_NAME_AND_BUILDER(Name(kName).Label(kLabel)); - EXPECT_THAT(kKernelName, ::testing::StrEq(kName)); - std::unique_ptr kernel_def = builder(); - EXPECT_THAT(kernel_def->op(), ::testing::StrEq(kName)); - EXPECT_THAT(kernel_def->label(), ::testing::StrEq(kLabel)); + static constexpr char const* kName = "Foo"; + static constexpr char const* kExtractedName = + EXTRACT_KERNEL_NAME_TO_STRING(Name(kName).Label("Label")); + EXPECT_THAT(kExtractedName, ::testing::StrEq(kName)); } } // namespace From dfd74b35d993791920c92b6b498668ca67079f59 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 2 Oct 2020 19:32:05 -0700 Subject: [PATCH 47/54] Integrate LLVM at llvm/llvm-project@88c9162c9d47 Updates LLVM usage to match [88c9162c9d47](https://github.com/llvm/llvm-project/commit/88c9162c9d47) PiperOrigin-RevId: 335144150 Change-Id: Iec15f83b63dc8548696a883de7727afa39ccd7eb --- tensorflow/workspace.bzl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 0a38596f151..2c807e0ead4 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -712,8 +712,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "0f0cbcc4b166f32603371fb1d62ef3816cf8425f" - LLVM_SHA256 = "60fb8f8e25b31ed4d0697129df5d4bc097f6539b8b5a0ec05c5030e171344c74" + LLVM_COMMIT = "88c9162c9d47ef43a505bc5301dc626f3cd4f437" + LLVM_SHA256 = "1f1f40dcde1dbc681c020427d4c3433006305daa24424b6de230ad5480a507d3" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), From 0f7d38ba0ef04c8eb158f79eec0ca57a989d4a61 Mon Sep 17 00:00:00 2001 From: Mehmet Deveci Date: Fri, 2 Oct 2020 20:16:05 -0700 Subject: [PATCH 48/54] Fixing test breaks. PiperOrigin-RevId: 335147190 Change-Id: I72a3c72b3d2915a829580f9de1d08eea85369456 --- tensorflow/python/tpu/tensor_tracer.py | 62 +++++++++++++++++++------- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/tpu/tensor_tracer.py b/tensorflow/python/tpu/tensor_tracer.py index 1a5c1330097..204bf0e3f10 100644 --- a/tensorflow/python/tpu/tensor_tracer.py +++ b/tensorflow/python/tpu/tensor_tracer.py @@ -1342,12 +1342,6 @@ class TensorTracer(object): else: return tensor - # No need to print core numbers if the cache is merged already. - if self._parameters.collect_summary_per_core: - core_message = ['core:', replica_id, ','] - else: - core_message = [] - # Check if the cache includes any nan or inf if self._parameters.trace_mode == tensor_tracer_flags.TRACE_MODE_NAN_INF: # Cache has 1s or 0s if the mode is NaN_INF @@ -1364,20 +1358,58 @@ class TensorTracer(object): lambda: 'NaNs or Infs in the step!', lambda: 'No numerical issues have been found for the step.') - print_op = logging_ops.print_v2( - '\n', *core_message, 'step:', step_num, '-->', step_error_message, - 'Printing tensors for mode:%s...' % self._parameters.trace_mode, - summarize=-1, - output_stream=output_stream) + # No need to print core numbers if the cache is merged already. + if self._parameters.collect_summary_per_core: + print_op = logging_ops.print_v2( + '\n', + 'core:', + replica_id, + ',', + 'step:', + step_num, + '-->', + step_error_message, + 'Printing tensors for mode:%s...' % self._parameters.trace_mode, + summarize=-1, + output_stream=output_stream) + else: + print_op = logging_ops.print_v2( + '\n', + 'step:', + step_num, + '-->', + step_error_message, + 'Printing tensors for mode:%s...' 
% self._parameters.trace_mode,
+            summarize=-1,
+            output_stream=output_stream)
     for tensor_name, cache_idx in sorted(
         tensor_trace_order.tensorname_to_cache_idx.items(),
         key=lambda item: item[1]):
       with ops.control_dependencies([print_op]):
-        print_op = logging_ops.print_v2(
-            *core_message, 'step:', step_num, ',',
-            tensor_name, '-->', _inspect_tensor(cache[cache_idx, 0]),
-            summarize=-1, output_stream=output_stream)
+        if self._parameters.collect_summary_per_core:
+          print_op = logging_ops.print_v2(
+              '\n',
+              'core:',
+              replica_id,
+              ',',
+              'step:',
+              step_num,
+              ',',
+              tensor_name,
+              '-->',
+              _inspect_tensor(cache[cache_idx, 0]),
+              summarize=-1, output_stream=output_stream)
+        else:
+          print_op = logging_ops.print_v2(
+              '\n',
+              'step:',
+              step_num,
+              ',',
+              tensor_name,
+              '-->',
+              _inspect_tensor(cache[cache_idx, 0]),
+              summarize=-1, output_stream=output_stream)
     return print_op
 
   def _generate_flush_cache_op(self, num_replicas, on_tpu, tensor_trace_order):

From 4fa8162dd8b5f7904694b75961bb3539a25353bd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Fri, 2 Oct 2020 20:18:18 -0700
Subject: [PATCH 49/54] Update Eigen to
 https://gitlab.com/libeigen/eigen/-/commit/f93841b53ef83460348b19d3b0e82a96a81cd05c

PiperOrigin-RevId: 335147393
Change-Id: I0ed4e283c551e5a96c85b40300243403b09428e7
---
 tensorflow/core/kernels/eigen_pooling.h      | 4 ++--
 tensorflow/core/kernels/quantization_utils.h | 9 ++++++++-
 tensorflow/python/keras/losses.py            | 2 +-
 tensorflow/workspace.bzl                     | 8 ++++----
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 7db4a69a8b3..b9c9e549b5d 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -131,8 +131,8 @@ SpatialMaxPooling(const Input& input, DenseIndex patchRows,
           .extract_image_patches(
               patchRows, patchCols, strideRows, strideCols, in_strideRows,
               in_strideCols, padding_type,
-              -Eigen::NumTraits<typename internal::remove_const<
-                  typename internal::traits<Input>::Scalar>::type>::highest())
+              Eigen::NumTraits<typename internal::remove_const<
+                  typename internal::traits<Input>::Scalar>::type>::lowest())
           .maximum(reduction_dims)
           .reshape(post_reduce_dims);
 }
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index fef3ed582b3..eaa29023a60 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -43,7 +43,8 @@ namespace tensorflow {
 // We have to be able to detect and handle overflows in int32, so this function
 // uses doubles and int64's to make sure we have enough room.
 template <class T>
-int64 FloatToQuantizedUnclamped(float input, float range_min, float range_max) {
+inline int64 FloatToQuantizedUnclamped(float input, float range_min,
+                                       float range_max) {
   const int64 lowest_quantized =
       static_cast<double>(Eigen::NumTraits<T>::lowest());
   if (range_min == range_max) {
@@ -60,6 +61,12 @@ int64 FloatToQuantizedUnclamped(float input, float range_min, float range_max) {
   return quantized;
 }
 
+template <>
+inline int64 FloatToQuantizedUnclamped<float>(float input, float range_min,
+                                              float range_max) {
+  return -1;
+}
+
 // This converts the float into the final quantized type, clamping/saturating
 // any over or underflows.
template <class T>
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index c66bc55a9a2..d7ad8e83a42 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -1288,7 +1288,7 @@ def mean_squared_logarithmic_error(y_true, y_pred):
   >>> assert loss.shape == (2,)
   >>> y_true = np.maximum(y_true, 1e-7)
   >>> y_pred = np.maximum(y_pred, 1e-7)
-  >>> assert np.array_equal(
+  >>> assert np.allclose(
   ...     loss.numpy(),
   ...     np.mean(
   ...         np.square(np.log(y_true + 1.) - np.log(y_pred + 1.)), axis=-1))
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 2c807e0ead4..2358dd26bb2 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -235,11 +235,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
         patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"),
-        sha256 = "a3c10a8c14f55e9f09f98b0a0ac6874c21bda91f65b7469d9b1f6925990e867b",  # SHARED_EIGEN_SHA
-        strip_prefix = "eigen-d10b27fe37736d2944630ecd7557cefa95cf87c9",
+        sha256 = "00ff67c15f8e8faf14495482e7396cc1d99cdfaaa2151f4aafef92bc754e634d",  # SHARED_EIGEN_SHA
+        strip_prefix = "eigen-22c971a225dbb567cd1a45f6006d16c4aa618551",
         urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/d10b27fe37736d2944630ecd7557cefa95cf87c9/eigen-d10b27fe37736d2944630ecd7557cefa95cf87c9.tar.gz",
-            "https://gitlab.com/libeigen/eigen/-/archive/d10b27fe37736d2944630ecd7557cefa95cf87c9/eigen-d10b27fe37736d2944630ecd7557cefa95cf87c9.tar.gz",
+            "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/22c971a225dbb567cd1a45f6006d16c4aa618551/eigen-22c971a225dbb567cd1a45f6006d16c4aa618551.tar.gz",
+            "https://gitlab.com/libeigen/eigen/-/archive/22c971a225dbb567cd1a45f6006d16c4aa618551/eigen-22c971a225dbb567cd1a45f6006d16c4aa618551.tar.gz",
         ],
     )

From 5b5aab7f63ad9667edee3bb837adc4033ccbeac3 Mon Sep 17 00:00:00 2001
From: Russell Power
Date: Fri, 2 Oct 2020 20:20:11 -0700
Subject: [PATCH 50/54] Internal change

PiperOrigin-RevId: 335147548
Change-Id: Ib445cfbcb28421b4eb522d4d9524e4a64fe631df
---
 tensorflow/c/eager/BUILD                      |   6 +-
 tensorflow/c/eager/c_api.cc                   |   4 +-
 tensorflow/compiler/jit/BUILD                 |  10 +-
 .../compiler/jit/xla_compilation_cache.cc     |   4 +-
 tensorflow/compiler/tf2xla/BUILD              |  10 +-
 tensorflow/compiler/tf2xla/xla_compiler.cc    |   4 +-
 tensorflow/compiler/xla/rpc/BUILD             |   4 +-
 tensorflow/core/BUILD                         |   7 +-
 tensorflow/core/common_runtime/BUILD          |   4 +-
 .../core/platform/default/build_config.bzl    |   4 +-
 tensorflow/core/tpu/kernels/BUILD             | 120 +++++++-----------
 .../tpu/kernels/tpu_compilation_cache_grpc.cc |   6 +-
 .../tpu/kernels/tpu_compilation_cache_grpc.h  |   6 +-
 .../tpu_compilation_cache_rpc_lookup.cc       |   2 +-
 .../tpu_compilation_cache_rpc_support.cc      |   6 +-
 .../tpu/kernels/tpu_compilation_metrics.cc    |   4 +-
 .../core/tpu/kernels/tpu_compile_op_impl.cc   |   4 +-
 tensorflow/core/tpu/kernels/tpu_pod_state.cc  |   4 +-
 tensorflow/stream_executor/gpu/BUILD          |   4 +-
 tensorflow/tensorflow.bzl                     |   6 +-
 20 files changed, 94 insertions(+), 125 deletions(-)

diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index 54771ffa840..b90b2644269 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -3,7 +3,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")
 load(
     "//tensorflow:tensorflow.bzl",
-    "if_tpu",
+    "if_libtpu",
     "tf_cc_test",
     "tf_copts",
     "tf_cuda_cc_test",
@@ -289,7 +289,7 @@ cc_library(
"//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/lib/llvm_rtti", - ] + if_tpu( + ] + if_libtpu( if_false = ["//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration"], if_true = [], ), @@ -354,7 +354,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/lib/llvm_rtti", - ] + if_tpu( + ] + if_libtpu( if_false = ["//tensorflow/compiler/mlir/tensorflow/c:mlir_c_api_registration"], if_true = [], ), diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 91c5a72ad64..5f388bfe0cd 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -39,7 +39,7 @@ limitations under the License. #include "tensorflow/c/eager/tfe_op_internal.h" #include "tensorflow/c/eager/tfe_tensorhandle_internal.h" #include "tensorflow/c/tf_tensor_internal.h" -#if defined(PLATFORM_GOOGLE) && !defined(LIBTFTPU) +#if defined(PLATFORM_GOOGLE) && !defined(LIBTPU_ON_GCE) #include "tensorflow/core/tfrt/eager/c_api_tfrt.h" #endif #include "tensorflow/core/common_runtime/device.h" @@ -729,7 +729,7 @@ void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { if (opts->use_tfrt) { -#if defined(PLATFORM_GOOGLE) && !defined(LIBTFTPU) +#if defined(PLATFORM_GOOGLE) && !defined(LIBTPU_ON_GCE) return tensorflow::wrap(new tfrt::tf::ContextInterface(opts->async)); #else status->status = tensorflow::errors::Unimplemented("TFRT is not supported"); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index b3655dcba63..da3db1789b5 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -4,7 +4,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "tf_cc_test") # buildifier: disable=same-origin-load -load("//tensorflow:tensorflow.bzl", "if_tpu", "tf_copts") +load("//tensorflow:tensorflow.bzl", "if_libtpu", "tf_copts") load("//tensorflow/stream_executor:build_defs.bzl", "if_cuda_or_rocm") # buildifier: disable=same-origin-load @@ -77,7 +77,7 @@ cc_library( "//tensorflow/compiler/jit/kernels:xla_ops", "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", - ] + if_tpu( + ] + if_libtpu( if_false = ["//tensorflow/compiler/xla/service:cpu_plugin"], if_true = [], ), @@ -114,7 +114,7 @@ cc_library( "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", - ] + if_tpu( + ] + if_libtpu( if_false = [ "//tensorflow/compiler/xla/service:cpu_plugin", # buildcleaner: keep ], @@ -141,7 +141,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core/common_runtime/gpu:gpu_init", - ] + if_tpu( + ] + if_libtpu( if_false = [ "//tensorflow/compiler/xla/service:gpu_plugin", # buildcleaner: keep ], @@ -375,7 +375,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:logging", - ] + if_tpu( + ] + if_libtpu( if_false = [ "//tensorflow/compiler/mlir:array_container_utils", "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 3ab3c19e439..91c2e0f9fdb 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -47,7 +47,7 
@@ limitations under the License. #include "tensorflow/core/public/version.h" #include "tensorflow/core/util/dump_graph.h" -#if !defined(LIBTFTPU) +#if !defined(LIBTPU_ON_GCE) #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" #include "tensorflow/compiler/mlir/utils/array_container_utils.h" #endif @@ -289,7 +289,7 @@ Status XlaCompilationCache::CompileSingleOp( }); const ConfigProto* config = ctx->function_library()->config_proto(); bool use_mlir = config && config->experimental().enable_mlir_bridge(); -#ifdef LIBTFTPU +#ifdef LIBTPU_ON_GCE if (use_mlir && has_tensor_list_arg) { LOG(WARNING) << "MLIR is not supported in this environment."; } diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 7b289f00bd0..03510701af4 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -1,5 +1,5 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load("//tensorflow:tensorflow.bzl", "if_tpu", "tf_cc_binary", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_openmp_copts") +load("//tensorflow:tensorflow.bzl", "if_libtpu", "tf_cc_binary", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_openmp_copts") load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", @@ -298,7 +298,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/stream_executor:platform", - ] + if_tpu( + ] + if_libtpu( if_false = [ "//tensorflow/compiler/xla/service:cpu_plugin", "//tensorflow/compiler/xla/service/cpu:buffer_info_util", @@ -369,7 +369,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", - ] + if_tpu( + ] + if_libtpu( if_false = [ "//tensorflow/compiler/mlir:array_container_utils", "//tensorflow/compiler/mlir/tensorflow:compile_mlir_util_no_tf_dialect_passes", @@ -877,13 +877,13 @@ cc_library( cc_library( name = "mlir_bridge_pass_registration", - srcs = if_tpu( + srcs = if_libtpu( if_false = [ "mlir_bridge_pass_registration.cc", ], if_true = [], ), - deps = if_tpu( + deps = if_libtpu( if_false = [ ":mlir_bridge_pass", "//tensorflow/compiler/mlir:mlir_graph_optimization_pass_registration", diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index e67e183aef9..c62b8286bbe 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -56,7 +56,7 @@ limitations under the License. 
#include "tensorflow/core/protobuf/graph_debug_info.pb.h" #include "tensorflow/core/util/dump_graph.h" -#ifndef LIBTFTPU +#ifndef LIBTPU_ON_GCE #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" #include "tensorflow/compiler/mlir/utils/array_container_utils.h" #endif @@ -733,7 +733,7 @@ Status XlaCompiler::CompileFunction( } VLOG(1) << "===================================================="; -#ifdef LIBTFTPU +#ifdef LIBTPU_ON_GCE if (GetMlirCommonFlags()->tf_mlir_enable_mlir_bridge) { VLOG(1) << "MLIR is not supported in this environment."; } diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index cb2c194858c..15022d1a879 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -2,7 +2,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_grpc_cc_dependency") load( "//tensorflow:tensorflow.bzl", - "if_tpu", + "if_libtpu", "tf_cc_binary", "tf_cc_test", ) @@ -57,7 +57,7 @@ cc_library( "//tensorflow/core:framework_internal", "//tensorflow/core:lib", tf_grpc_cc_dependency(), - ] + if_tpu( + ] + if_libtpu( if_false = ["//tensorflow/compiler/xla/service:cpu_plugin"], if_true = [], ), diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index a664c96d15b..a1149ec1187 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -68,9 +68,9 @@ load( "if_chromiumos", "if_cuda_or_rocm", "if_ios", + "if_libtpu", "if_mobile", "if_not_windows", - "if_tpu", "tf_android_core_proto_headers", "tf_cc_test", "tf_cc_test_mkl", @@ -894,8 +894,7 @@ cc_library( "//tensorflow/c/kernels:summary_op_lib", ] + if_chromiumos( [], - # Non-tpu platforms don't need tpu dependency. It would be best to guard - # them by if_tpu. But there is no such flag yet. + # Non-tpu platforms don't need tpu dependency. [ ":tpu_configuration_ops_op_lib", ":tpu_cross_replica_ops_op_lib", @@ -916,7 +915,7 @@ cc_library( ]) + if_tensorrt([ "//tensorflow/compiler/tf2tensorrt:trt_engine_resource_ops_op_lib", "//tensorflow/compiler/tf2tensorrt:trt_op_libs", - ]) + if_tpu( + ]) + if_libtpu( if_false = ["//tensorflow/compiler/mlir/tensorflow:mlir_passthrough_op"], if_true = [], ), diff --git a/tensorflow/core/common_runtime/BUILD b/tensorflow/core/common_runtime/BUILD index f0378ad7538..66dfdffaccf 100644 --- a/tensorflow/core/common_runtime/BUILD +++ b/tensorflow/core/common_runtime/BUILD @@ -1,6 +1,6 @@ load( "//tensorflow:tensorflow.bzl", - "if_tpu", + "if_libtpu", "tf_cc_test", "tf_cc_test_mkl", "tf_cc_tests", @@ -93,7 +93,7 @@ cc_library( deps = [ ":core_cpu", "//tensorflow/core/common_runtime/gpu:gpu_runtime", - ] + if_tpu(["//tensorflow/core/tpu:tpu_runtime"]), + ] + if_libtpu(["//tensorflow/core/tpu:tpu_runtime"]), ) filegroup( diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index 9a1068d5a3a..d049ecec70a 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -1,7 +1,7 @@ # Platform-specific build configurations. 
load("@com_google_protobuf//:protobuf.bzl", "proto_gen") -load("//tensorflow:tensorflow.bzl", "clean_dep", "if_not_windows", "if_tpu") +load("//tensorflow:tensorflow.bzl", "clean_dep", "if_libtpu", "if_not_windows") load("//tensorflow/core/platform:build_config_root.bzl", "if_static") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") @@ -814,4 +814,4 @@ def if_llvm_system_z_available(then, otherwise = []): }) def tf_tpu_dependencies(): - return if_tpu(["//tensorflow/core/tpu/kernels"]) + return if_libtpu(["//tensorflow/core/tpu/kernels"]) diff --git a/tensorflow/core/tpu/kernels/BUILD b/tensorflow/core/tpu/kernels/BUILD index 90107abf4b6..8de50acfd6c 100644 --- a/tensorflow/core/tpu/kernels/BUILD +++ b/tensorflow/core/tpu/kernels/BUILD @@ -5,13 +5,11 @@ load( "//tensorflow/core/platform:build_config.bzl", "tf_proto_library", ) +load("//tensorflow:tensorflow.bzl", "if_libtpu", "tf_copts") load("//tensorflow:tensorflow.bzl", "tf_grpc_cc_dependency") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_kernel_library") # buildifier: disable=same-origin-load # Config setting to enable go/libtpu support. -WITH_TPU_SUPPORT = "//tensorflow:with_tpu_support" - -DEFAULT = "//conditions:default" package( default_visibility = [ @@ -44,10 +42,10 @@ cc_library( name = "tpu_compile_op_common", srcs = ["tpu_compile_op_common.cc"], hdrs = ["tpu_compile_op_common.h"], - deps = select({ - WITH_TPU_SUPPORT: [":tpu_compilation_metrics"], - DEFAULT: ["//tensorflow/core/tpu/kernels:tpu_compilation_metrics"], - }) + [ + deps = if_libtpu( + [":tpu_compilation_metrics"], + ["//tensorflow/core/tpu/kernels:tpu_compilation_metrics"], + ) + [ ":tpu_compilation_cache_entry_unloader", ":tpu_compilation_cache_interface", ":tpu_compilation_metrics_hdrs", @@ -97,14 +95,10 @@ tf_kernel_library( name = "tpu_configuration_ops", srcs = ["tpu_configuration_ops.cc"], hdrs = ["tpu_configuration_ops.h"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), - deps = select({ - WITH_TPU_SUPPORT: [":tpu_util"], - DEFAULT: ["//tensorflow/core/tpu/kernels:tpu_util"], - }) + [ + deps = if_libtpu( + [":tpu_util"], + ["//tensorflow/core/tpu/kernels:tpu_util"], + ) + [ ":tpu_compilation_cache_factory", ":tpu_compilation_cache_interface", ":tpu_compilation_cache_local_lookup", @@ -346,10 +340,10 @@ cc_library( name = "tpu_compilation_cache_interface", srcs = ["tpu_compilation_cache_interface.cc"], hdrs = ["tpu_compilation_cache_interface.h"], - deps = select({ - WITH_TPU_SUPPORT: [":tpu_compilation_metrics"], - DEFAULT: ["//tensorflow/core/tpu/kernels:tpu_compilation_metrics"], - }) + [ + deps = if_libtpu( + [":tpu_compilation_metrics"], + ["//tensorflow/core/tpu/kernels:tpu_compilation_metrics"], + ) + [ ":compiled_subgraph", ":tpu_compilation_cache_common_proto_cc", ":tpu_compilation_cache_entry", @@ -424,10 +418,7 @@ cc_library( cc_library( name = "tpu_compilation_metrics", srcs = ["tpu_compilation_metrics.cc"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), + copts = tf_copts(), deps = [ ":tpu_compilation_metrics_hdrs", ], @@ -529,14 +520,11 @@ cc_library( cc_library( name = "tpu_compilation_cache_rpc_support_hdrs", hdrs = ["tpu_compilation_cache_rpc_support.h"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), - deps = select({ - WITH_TPU_SUPPORT: [":tpu_compilation_cache_proto_cc"], # build_cleaner: keep - DEFAULT: 
["//tensorflow/core/tpu/kernels:tpu_compilation_cache_cc_proto"], # build_cleaner: keep - }) + [ + copts = tf_copts(), + deps = if_libtpu( + [":tpu_compilation_cache_proto_cc"], + ["//tensorflow/core/tpu/kernels:tpu_compilation_cache_cc_proto"], + ) + [ ":tpu_compilation_cache_entry", ":tpu_compilation_cache_interface", ":tpu_compilation_cache_lookup", @@ -550,10 +538,7 @@ cc_library( cc_library( name = "tpu_compilation_cache_rpc_support", srcs = ["tpu_compilation_cache_rpc_support.cc"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), + copts = tf_copts(), deps = [ ":tpu_compilation_cache_common_proto_cc", ":tpu_compilation_cache_proto_cc", @@ -572,14 +557,11 @@ cc_library( name = "tpu_compilation_cache_rpc_lookup", srcs = ["tpu_compilation_cache_rpc_lookup.cc"], hdrs = ["tpu_compilation_cache_rpc_lookup.h"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), - deps = select({ - WITH_TPU_SUPPORT: [":tpu_compilation_cache_rpc_support"], - DEFAULT: ["//tensorflow/core/tpu/kernels:tpu_compilation_cache_rpc_support"], - }) + [ + copts = tf_copts(), + deps = if_libtpu( + [":tpu_compilation_cache_rpc_support"], + ["//tensorflow/core/tpu/kernels:tpu_compilation_cache_rpc_support"], + ) + [ ":tpu_compilation_cache_grpc", ":tpu_compilation_cache_interface", ":tpu_compilation_cache_lookup", @@ -617,14 +599,11 @@ cc_library( name = "tpu_compilation_cache_grpc", srcs = ["tpu_compilation_cache_grpc.cc"], hdrs = ["tpu_compilation_cache_grpc.h"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), - deps = select({ - WITH_TPU_SUPPORT: [":tpu_compilation_cache_proto_cc"], - DEFAULT: ["//tensorflow/core/tpu/kernels:tpu_compilation_cache_cc_proto"], - }) + [ + copts = tf_copts(), + deps = if_libtpu( + [":tpu_compilation_cache_proto_cc"], + ["//tensorflow/core/tpu/kernels:tpu_compilation_cache_cc_proto"], + ) + [ ":tpu_compilation_cache_common_proto_cc", tf_grpc_cc_dependency(), ], @@ -634,20 +613,17 @@ cc_library( name = "tpu_compilation_cache_service", srcs = ["tpu_compilation_cache_service.cc"], hdrs = ["tpu_compilation_cache_service.h"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), - deps = select({ - WITH_TPU_SUPPORT: [ - ":tpu_compilation_cache_rpc_support", # build_cleaner: keep - ":tpu_compilation_cache_proto_cc", # build_cleaner: keep + copts = tf_copts(), + deps = if_libtpu( + [ + ":tpu_compilation_cache_rpc_support", + ":tpu_compilation_cache_proto_cc", ], - DEFAULT: [ - "//tensorflow/core/tpu/kernels:tpu_compilation_cache_rpc_support", # build_cleaner: keep - "//tensorflow/core/tpu/kernels:tpu_compilation_cache_cc_proto", # build_cleaner: keep + [ + "//tensorflow/core/tpu/kernels:tpu_compilation_cache_rpc_support", + "//tensorflow/core/tpu/kernels:tpu_compilation_cache_cc_proto", ], - }) + [ + ) + [ ":tpu_compilation_cache_common_proto_cc", ":tpu_compilation_cache_grpc", ":tpu_compilation_cache_interface", @@ -704,10 +680,7 @@ cc_library( name = "tpu_compile_op_impl", srcs = ["tpu_compile_op_impl.cc"], hdrs = ["tpu_compile_op_impl.h"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), + copts = tf_copts(), deps = [ ":tpu_compilation_cache_key", ":tpu_compile_c_api_hdrs", @@ -952,14 +925,11 @@ cc_library( name = "tpu_pod_state", srcs = ["tpu_pod_state.cc"], hdrs = ["tpu_pod_state.h"], - copts = select({ - WITH_TPU_SUPPORT: ["-DLIBTFTPU"], - DEFAULT: [], - }), - deps = select({ - WITH_TPU_SUPPORT: [":tpu_util"], - DEFAULT: ["//tensorflow/core/tpu/kernels:tpu_util"], 
- }) + [ + copts = tf_copts(), + deps = if_libtpu( + [":tpu_util"], + ["//tensorflow/core/tpu/kernels:tpu_util"], + ) + [ ":tpu_compilation_cache_service", "//tensorflow/c:tf_status", "//tensorflow/c:tf_status_helper", diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.cc b/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.cc index 207a60e7b48..c3aa62805c0 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.cc +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.cc @@ -30,11 +30,11 @@ namespace tensorflow { namespace tpu { static const char* grpcTpuCompilationCacheService_method_names[] = { -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) "/tensorflow.tpu.TpuCompilationCacheServiceExternal/GetTpuProgram", -#else // LIBTFTPU +#else // LIBTPU_ON_GCE "/tensorflow.tpu.TpuCompilationCacheService/GetTpuProgram", -#endif // LIBTFTPU +#endif // LIBTPU_ON_GCE }; std::unique_ptr diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h b/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h index 324fc9e6f08..55877d15df2 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_grpc.h @@ -35,7 +35,7 @@ limitations under the License. #include -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) #include "tensorflow/core/tpu/kernels/tpu_compilation_cache.pb.h" #else #include "tensorflow/core/tpu/kernels/tpu_compilation_cache.pb.h" // copybara" @@ -48,7 +48,7 @@ namespace grpc { class TpuCompilationCacheService final { public: using RequestType = ::tensorflow::tpu::GetTpuProgramRequest; -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) using ResponseType = ::tensorflow::tpu::GetTpuProgramResponseExternal; #else using ResponseType = ::tensorflow::tpu::GetTpuProgramResponse; @@ -59,7 +59,7 @@ class TpuCompilationCacheService final { enum class MethodId { kGetTpuProgram = 0 }; static constexpr char const* service_full_name() { -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) return "tensorflow.tpu.TpuCompilationCacheServiceExternal"; #else return "tensorflow.tpu.TpuCompilationCacheService"; diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.cc b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.cc index 8b0fb674682..7846cc7bbb3 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.cc +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_lookup.cc @@ -25,7 +25,7 @@ namespace tensorflow { namespace tpu { namespace { -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) using ResponseType = GetTpuProgramResponseExternal; #else using ResponseType = GetTpuProgramResponse; diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.cc b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.cc index 9a6ca6be7e4..29ec8701a37 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.cc +++ b/tensorflow/core/tpu/kernels/tpu_compilation_cache_rpc_support.cc @@ -17,7 +17,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" #include "tensorflow/core/platform/casts.h" -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) #include "tensorflow/core/tpu/kernels/tpu_compilation_cache.pb.h" #endif #include "tensorflow/core/tpu/kernels/tpu_compilation_cache_common.pb.h" @@ -30,7 +30,7 @@ std::shared_ptr<::grpc::ChannelCredentials> CreateChannelCredentials() { return ::grpc::InsecureChannelCredentials(); // NOLINT } -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) template <> Status DeserializeRpcResponseToCacheEntry( absl::string_view local_proto_key, GetTpuProgramResponseExternal* response, @@ -156,6 +156,6 @@ xla::StatusOr> SerializeCacheEntryToBufferSlices( return std::vector<::grpc::Slice>{::grpc::Slice(encoded_header)}; } -#endif // LIBTFTPU +#endif // LIBTPU_ON_GCE } // namespace tpu } // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_compilation_metrics.cc b/tensorflow/core/tpu/kernels/tpu_compilation_metrics.cc index e1a65ad0f32..ce982a1bd9a 100644 --- a/tensorflow/core/tpu/kernels/tpu_compilation_metrics.cc +++ b/tensorflow/core/tpu/kernels/tpu_compilation_metrics.cc @@ -19,7 +19,7 @@ namespace tpu { // TODO(henrytan): remove this once `TpuCompilationCache` migration to OSS is // completed. -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) /* static */ void TpuCompilationMetrics::IncrementCacheLookupCount( bool is_cache_hit, absl::string_view session_name) { @@ -36,7 +36,7 @@ void TpuCompilationMetrics::IncrementCompilationCount( absl::string_view session_name) { // A placeholder for tracking metrics. } -#endif // LIBTFTPU +#endif // LIBTPU_ON_GCE } // namespace tpu } // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_compile_op_impl.cc b/tensorflow/core/tpu/kernels/tpu_compile_op_impl.cc index 8703dd818f5..270c2c53d7a 100644 --- a/tensorflow/core/tpu/kernels/tpu_compile_op_impl.cc +++ b/tensorflow/core/tpu/kernels/tpu_compile_op_impl.cc @@ -68,11 +68,11 @@ class TpuCompileOpImplFactory : public CompileOpImplFactory { } }; -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) REGISTER_MODULE_INITIALIZER(tpu_compile_op_impl_factory, { VLOG(1) << "register TpuCompileOpImplFactory()"; CompileOpImplFactory::Register(new TpuCompileOpImplFactory()); }); -#endif // LIBTFTPU +#endif // LIBTPU_ON_GCE } // namespace tpu } // namespace tensorflow diff --git a/tensorflow/core/tpu/kernels/tpu_pod_state.cc b/tensorflow/core/tpu/kernels/tpu_pod_state.cc index 7b02998b343..898f02b28e9 100644 --- a/tensorflow/core/tpu/kernels/tpu_pod_state.cc +++ b/tensorflow/core/tpu/kernels/tpu_pod_state.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/c/tf_status_helper.h" #include "tensorflow/core/tpu/tpu_api.h" -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) #include "tensorflow/core/tpu/kernels/tpu_util.h" #else #include "tensorflow/core/tpu/kernels/tpu_util.h" // copybara" @@ -54,7 +54,7 @@ xla::StatusOr> ConstructCacheService(ResourceMgr* rmgr, int serving_port, tpu::TpuCompilationCacheInterface* compilation_cache) { xla::StatusOr> server_builder; -#if defined(LIBTFTPU) +#if defined(LIBTPU_ON_GCE) server_builder = tpu::CreateServerBuilder(serving_port); #else server_builder = tpu::CreateServerBuilderGoogle(serving_port); diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD index 7fbb40e0ae9..8626c34d383 100644 --- a/tensorflow/stream_executor/gpu/BUILD +++ b/tensorflow/stream_executor/gpu/BUILD @@ -10,7 +10,7 @@ load( "//tensorflow/core/platform/default:cuda_build_defs.bzl", "if_cuda_is_configured", ) -load("//tensorflow:tensorflow.bzl", "if_tpu", "tf_copts") +load("//tensorflow:tensorflow.bzl", "if_libtpu", "tf_copts") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") load( "//tensorflow/core/platform:rules_cc.bzl", @@ -70,7 +70,7 @@ cc_library( "//tensorflow/stream_executor:device_options", "//tensorflow/stream_executor/lib", "//tensorflow/stream_executor/platform", - ] + if_tpu( + ] + if_libtpu( if_false = ["@local_config_cuda//cuda:cuda_headers"], if_true = [], ), diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 8ed12136c55..990130e1702 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -261,8 +261,8 @@ def if_nccl(if_true, if_false = []): "//conditions:default": if_true, }) -def if_tpu(if_true, if_false = []): - """Shorthand for select()ing whether to build for TPUs.""" +def if_libtpu(if_true, if_false = []): + """Shorthand for select()ing whether to build support for using TPUs via libtpu.so""" return select({ str(Label("//tensorflow:with_tpu_support")): if_true, "//conditions:default": if_false, @@ -328,7 +328,7 @@ def tf_copts( (if_not_windows(["-fno-exceptions"]) if not allow_exceptions else []) + if_cuda(["-DGOOGLE_CUDA=1"]) + if_nvcc(["-DTENSORFLOW_USE_NVCC=1"]) + - if_tpu(["-DLIBTFTPU"]) + + if_libtpu(["-DLIBTPU_ON_GCE"], []) + if_xla_available(["-DTENSORFLOW_USE_XLA=1"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"]) + if_mkl(["-DINTEL_MKL=1", "-DENABLE_MKLDNN_V1", "-DENABLE_INTEL_MKL_BFLOAT16"]) + From fd20aef919be295ce540aef232a4450ffb5fb521 Mon Sep 17 00:00:00 2001 From: Raman Sarokin Date: Fri, 2 Oct 2020 22:52:11 -0700 Subject: [PATCH 51/54] Removed expensive check that one layer texture array supported. 
PiperOrigin-RevId: 335160656 Change-Id: Ic55f1fb51143090ff92a06deb7d8f685b7c10a06 --- tensorflow/lite/delegates/gpu/cl/BUILD | 2 - .../lite/delegates/gpu/cl/environment.cc | 75 ++----------------- .../lite/delegates/gpu/cl/environment.h | 1 - 3 files changed, 7 insertions(+), 71 deletions(-) diff --git a/tensorflow/lite/delegates/gpu/cl/BUILD b/tensorflow/lite/delegates/gpu/cl/BUILD index 28b6cc7671a..907302f156a 100644 --- a/tensorflow/lite/delegates/gpu/cl/BUILD +++ b/tensorflow/lite/delegates/gpu/cl/BUILD @@ -286,10 +286,8 @@ cc_library( ":cl_command_queue", ":cl_context", ":cl_device", - ":cl_kernel", ":precision", ":program_cache", - ":tensor", ":tensor_type", ":util", "//tensorflow/lite/delegates/gpu/common:data_type", diff --git a/tensorflow/lite/delegates/gpu/cl/environment.cc b/tensorflow/lite/delegates/gpu/cl/environment.cc index 785e88299a7..d0f2463bcb2 100644 --- a/tensorflow/lite/delegates/gpu/cl/environment.cc +++ b/tensorflow/lite/delegates/gpu/cl/environment.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include -#include "tensorflow/lite/delegates/gpu/cl/cl_kernel.h" #include "tensorflow/lite/delegates/gpu/cl/util.h" #include "tensorflow/lite/delegates/gpu/common/shape.h" @@ -26,59 +25,6 @@ namespace tflite { namespace gpu { namespace cl { namespace { - -std::string GetKernelOneLayerTextureArray() { - return R"( - -__kernel void main_function(__write_only image2d_array_t dst) { - int X = (int)(get_global_id(0)); - int Y = (int)(get_global_id(1)); - - write_imagef(dst, (int4)(X, Y, 0, 0), (float4)(2.0, 2.0, 2.0, 2.0)); -} -)"; -} - -// Some Adreno < 600 have bug with one layer texture array. b/131099086 -// If we have one layer texture array and will write smt from kernel to this -// texture, we will get zeroes instead of actual values. -// The same kernel will work, if we use texture array with more than one layer. -// With help of this code we can detect this bug. 
-absl::Status CheckKernelSupportOfOneLayerTextureArray(Environment* env, - bool* result) { - // No bug on Adreno 6xx - if (env->device().info_.adreno_info.gpu_version >= 600) { - *result = true; - return absl::OkStatus(); - } - CLKernel kernel; - RETURN_IF_ERROR(env->program_cache()->GetOrCreateCLKernel( - GetKernelOneLayerTextureArray(), "main_function", env->context(), - env->device(), &kernel)); - - Tensor tensor; - const BHWC shape(1, 4, 4, 4); - RETURN_IF_ERROR(CreateTensor( - env->context(), shape, - {DataType::FLOAT32, TensorStorageType::TEXTURE_ARRAY, Layout::HWC}, - &tensor)); - RETURN_IF_ERROR(kernel.SetMemory(0, tensor.GetMemoryPtr())); - RETURN_IF_ERROR(env->queue()->DispatchImplicit(kernel, {4, 4, 1}, {4, 4, 1})); - TensorFloat32 tensor_gpu; - tensor_gpu.shape = shape; - tensor_gpu.data.resize(shape.DimensionsProduct()); - RETURN_IF_ERROR(tensor.ReadData(env->queue(), &tensor_gpu)); - - *result = true; - for (int i = 0; i < 64; ++i) { - if (tensor_gpu.data[i] != 2.0) { - *result = false; - break; - } - } - return absl::OkStatus(); -} - absl::Status CreateEnvironment(Environment* result, bool shared, cl_context_properties egl_context, cl_context_properties egl_display) { @@ -99,16 +45,7 @@ absl::Status CreateEnvironment(Environment* result, bool shared, *result = Environment(std::move(gpu), std::move(context), std::move(queue), std::move(profiling_queue)); - if (result->device().IsAdreno() && result->device().SupportsTextureArray()) { - bool supports_one_layer; - RETURN_IF_ERROR( - CheckKernelSupportOfOneLayerTextureArray(result, &supports_one_layer)); - if (!supports_one_layer) { - result->GetDevicePtr()->DisableOneLayerTextureArray(); - } - } - - return absl::OkStatus(); + return result->Init(); } } // namespace @@ -141,10 +78,12 @@ Environment& Environment::operator=(Environment&& environment) { absl::Status Environment::Init() { if (device().IsAdreno() && device().SupportsTextureArray()) { - bool supports_one_layer; - RETURN_IF_ERROR( - CheckKernelSupportOfOneLayerTextureArray(this, &supports_one_layer)); - if (!supports_one_layer) { + // Some Adreno < 600 have bug with one layer texture array. b/131099086 + // If we have one layer texture array and will write smt from kernel to this + // texture, we will get zeroes instead of actual values. + // The same kernel will work, if we use texture array with more than one + // layer. + if (device().info_.adreno_info.gpu_version < 600) { GetDevicePtr()->DisableOneLayerTextureArray(); } } diff --git a/tensorflow/lite/delegates/gpu/cl/environment.h b/tensorflow/lite/delegates/gpu/cl/environment.h index 640f2d8cac3..43b5467d2ca 100644 --- a/tensorflow/lite/delegates/gpu/cl/environment.h +++ b/tensorflow/lite/delegates/gpu/cl/environment.h @@ -21,7 +21,6 @@ limitations under the License. #include "tensorflow/lite/delegates/gpu/cl/cl_device.h" #include "tensorflow/lite/delegates/gpu/cl/precision.h" #include "tensorflow/lite/delegates/gpu/cl/program_cache.h" -#include "tensorflow/lite/delegates/gpu/cl/tensor.h" #include "tensorflow/lite/delegates/gpu/cl/tensor_type.h" #include "tensorflow/lite/delegates/gpu/common/data_type.h" #include "tensorflow/lite/delegates/gpu/common/status.h" From 177639123d6fdcfd636c2ecc3c93a85c2700753c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 2 Oct 2020 23:54:05 -0700 Subject: [PATCH 52/54] [MLIR][KernelGen] Compile for multiple NVIDIA GPU architectures simultaneously For every architecture, compile the kernel module to ptx and to asm. 
The resulting cubins are then combined into one fatbin using the fatbinary tool. This change only affects the `tf_to_kernel` tool. PiperOrigin-RevId: 335165109 Change-Id: Ibed7bf37a732f6cdab29a9b7a978ae52a0033e80 --- tensorflow/compiler/mlir/runlit.cfg.py | 4 +- .../compiler/mlir/tools/kernel_gen/BUILD | 5 +- .../mlir/tools/kernel_gen/kernel_creator.cc | 12 +- .../mlir/tools/kernel_gen/kernel_creator.h | 5 +- .../tests/tf_to_gpu_binary/tanh.mlir | 3 +- .../tools/kernel_gen/tests/tf_to_kernel/BUILD | 17 --- .../kernel_gen/tests/tf_to_kernel/tanh.mlir | 6 - .../mlir/tools/kernel_gen/tf_to_gpu_binary.cc | 2 +- .../mlir/tools/kernel_gen/tf_to_kernel.cc | 11 +- .../mlir/tools/kernel_gen/transforms/BUILD | 1 - .../transforms/gpu_kernel_to_blob_pass.cc | 76 +++-------- .../mlir/tools/kernel_gen/transforms/passes.h | 3 +- .../tools/kernel_gen/transforms/passes.td | 5 +- .../kernels/mlir_generated/build_defs.bzl | 3 + tensorflow/stream_executor/cuda/BUILD | 2 +- tensorflow/stream_executor/gpu/BUILD | 2 +- .../stream_executor/gpu/asm_compiler.cc | 124 +++--------------- tensorflow/stream_executor/gpu/asm_compiler.h | 10 -- 18 files changed, 61 insertions(+), 230 deletions(-) delete mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD delete mode 100644 tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/tanh.mlir diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index 17410b4e5b2..e403a75d3b9 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -74,8 +74,8 @@ tool_names = [ 'tf_tfjs_translate', 'flatbuffer_to_string', 'flatbuffer_translate', 'tf-mlir-translate', 'mlir-tflite-runner', 'tfcompile', 'json_to_flatbuffer', 'xla-gpu-opt', 'xla-mlir-gpu-opt', 'xla-opt', - 'hlo_to_llvm_ir', 'kernel-gen-opt', 'tf_to_kernel', 'tf_to_gpu_binary', - 'xla-thunks-opt', 'tfjs-opt' + 'hlo_to_llvm_ir', 'kernel-gen-opt', 'tf_to_gpu_binary', 'xla-thunks-opt', + 'tfjs-opt' ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 181b928bfd5..a1db342c411 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -105,10 +105,7 @@ tf_cc_binary( tf_cc_binary( name = "tf_to_kernel", srcs = ["tf_to_kernel.cc"], - visibility = [ - "//tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel:__pkg__", - "//tensorflow/core/kernels/mlir_generated:__pkg__", - ], + visibility = ["//tensorflow/core/kernels/mlir_generated:__pkg__"], deps = [ ":kernel_creator", "//tensorflow/compiler/mlir:init_mlir", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc index c3b16721f56..48696f6e8b0 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -174,8 +174,7 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only, Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only, llvm::ArrayRef same_shape, llvm::StringRef gpu_binary_attr_name, - llvm::ArrayRef architectures, - bool generate_fatbin) { + int32_t architecture) { mlir::PassManager pm(module.getContext()); applyTensorflowAndCLOptions(pm); @@ -188,7 +187,7 @@ Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only, } 
kernel_pm.addPass(mlir::createStripDebugInfoPass()); kernel_pm.addPass(mlir::kernel_gen::transforms::CreateGpuKernelToBlobPass( - gpu_binary_attr_name, architectures, generate_fatbin)); + gpu_binary_attr_name, architecture)); if (!gpu_binary_only) { pm.addPass(mlir::kernel_gen::transforms::CreateTFKernelToLLVMPass()); @@ -203,9 +202,9 @@ Status LowerGPUToLLVM(mlir::ModuleOp module, bool gpu_binary_only, StatusOr GenerateKernelForTfCode( mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only, - llvm::ArrayRef architectures, llvm::ArrayRef tile_sizes, + int32_t architecture, llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, - llvm::ArrayRef unroll_factors, bool generate_fatbin) { + llvm::ArrayRef unroll_factors) { mlir::RegisterAllTensorFlowDialects(context.getDialectRegistry()); mlir::OwningModuleRef module = mlir::parseSourceString(tf_code, &context); TF_RETURN_IF_ERROR( @@ -222,8 +221,7 @@ StatusOr GenerateKernelForTfCode( TF_RETURN_IF_ERROR(xla::mlir_gpu::LowerKernelBodiesToNVVM(module.get())); #endif TF_RETURN_IF_ERROR(LowerGPUToLLVM(module.get(), gpu_binary_only, same_shape, - kGpuBinaryAttrName, architectures, - generate_fatbin)); + kGpuBinaryAttrName, architecture)); return module; } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h index 0a74a8a3d5a..b168ec815de 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h @@ -38,10 +38,9 @@ namespace kernel_gen { // false, lowers the host side to LLVM Dialect. xla::StatusOr GenerateKernelForTfCode( mlir::MLIRContext& context, llvm::StringRef tf_code, bool gpu_binary_only, - llvm::ArrayRef architectures = {75}, - llvm::ArrayRef tile_sizes = {16, 64}, + int32_t architecture = 75, llvm::ArrayRef tile_sizes = {16, 64}, llvm::ArrayRef same_shape = {}, - llvm::ArrayRef unroll_factors = {}, bool generate_fatbin = true); + llvm::ArrayRef unroll_factors = {}); // Extracts gpu_binary from the converted module. xla::StatusOr ExtractGpuBinary(mlir::ModuleOp module); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir index de9f4aee1cb..e596c338b14 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_gpu_binary/tanh.mlir @@ -1,5 +1,6 @@ // RUN: tf_to_gpu_binary --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=70 func @tanh(%arg0: tensor) -> tensor { - %0 = "tf.Tanh"(%arg0) : (tensor) -> tensor + %0 = "tf.Tanh"(%arg0) { } + : (tensor) -> tensor return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD deleted file mode 100644 index 24e288c246c..00000000000 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") - -package(licenses = ["notice"]) - -glob_lit_tests( - data = [ - "//tensorflow/compiler/mlir/tools/kernel_gen:tf_to_kernel", - "@llvm-project//mlir:run_lit.sh", - ], - default_tags = [ - # We need access to the CUDA SDK. 
- "gpu", - "no_rocm", - ], - driver = "//tensorflow/compiler/mlir:run_lit.sh", - test_file_exts = ["mlir"], -) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/tanh.mlir deleted file mode 100644 index d5d1b87bb67..00000000000 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/tanh.mlir +++ /dev/null @@ -1,6 +0,0 @@ -// RUN: tf_to_kernel --input=%s --output=%t --same_shape=0,1 --unroll_factors=4 --tile_sizes=256 --arch=70,75 - -func @tanh(%arg: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.Tanh"(%arg) : (tensor<*xf32>) -> tensor<*xf32> - return %0 : tensor<*xf32> -} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc index cbd97e258b7..c7cb92404f5 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_gpu_binary.cc @@ -48,7 +48,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, mlir::OwningModuleRef module, GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/true, architecture, tile_sizes, same_shape, - unroll_factors, /*generate_fatbin=*/false)); + unroll_factors)); // Extract gpu_binary. TF_ASSIGN_OR_RETURN(std::string gpu_binary, ExtractGpuBinary(*module)); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc index d2d71a28ff3..e62fa47cea9 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc @@ -95,8 +95,7 @@ xla::StatusOr EmitToBinary(mlir::ModuleOp module) { } xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, - llvm::ArrayRef architectures, - llvm::ArrayRef tile_sizes, + int32_t architecture, llvm::ArrayRef tile_sizes, llvm::ArrayRef same_shape, llvm::ArrayRef unroll_factors) { // Read TF code. @@ -108,7 +107,7 @@ xla::Status Run(llvm::StringRef input_file, llvm::StringRef output_file, TF_ASSIGN_OR_RETURN( mlir::OwningModuleRef module, GenerateKernelForTfCode(context, tf_code, /*gpu_binary_only=*/false, - architectures, tile_sizes, same_shape, + architecture, tile_sizes, same_shape, unroll_factors)); // Get binary. TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module)); @@ -130,8 +129,8 @@ int main(int argc, char** argv) { llvm::cl::opt output_file( "output", llvm::cl::desc("output file"), llvm::cl::value_desc("filename"), llvm::cl::init("foo.bin")); - llvm::cl::list architectures( - "arch", llvm::cl::desc("target architectures (e.g. 50 for sm_50)"), + llvm::cl::list architecture( + "arch", llvm::cl::desc("target architecture (e.g. 
50 for sm_50)"), llvm::cl::OneOrMore, llvm::cl::CommaSeparated); llvm::cl::list tile_sizes( "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore, @@ -152,7 +151,7 @@ int main(int argc, char** argv) { llvm::cl::ParseCommandLineOptions(argc, argv, "TF op GPU kernel generator\n"); auto status = - tensorflow::kernel_gen::Run(input_file, output_file, architectures, + tensorflow::kernel_gen::Run(input_file, output_file, architecture.front(), tile_sizes, same_shape, unroll_factors); if (!status.ok()) { LOG(ERROR) << status; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD index caa665b2971..d4110b466c9 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/BUILD @@ -117,7 +117,6 @@ cc_library( "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", - "@llvm-project//llvm:TransformUtils", "//tensorflow/compiler/mlir/hlo", "//tensorflow/compiler/mlir/hlo:hlo_legalize_to_lhlo", "//tensorflow/compiler/mlir/hlo:lhlo", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc index f995c22f36f..dda0e242b2e 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "llvm/Transforms/Utils/Cloning.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" // from @llvm-project #include "mlir/Target/NVVMIR.h" // from @llvm-project #include "mlir/Target/ROCDLIR.h" // from @llvm-project @@ -50,12 +49,9 @@ using xla::InternalError; class GpuKernelToBlobPass : public GpuKernelToBlobPassBase { public: - GpuKernelToBlobPass(mlir::StringRef blob_annotation, - llvm::ArrayRef architectures, - bool generate_fatbin) { + GpuKernelToBlobPass(mlir::StringRef blob_annotation, int32_t arch) { blob_annotation_ = blob_annotation.str(); - architectures_ = architectures; - generate_fatbin_ = generate_fatbin; + arch_ = arch; } void runOnOperation() override { @@ -73,17 +69,7 @@ class GpuKernelToBlobPass xla::StatusOr> GetGpuBinaryBlob( mlir::gpu::GPUModuleOp gpu_module) { - if (architectures_.empty()) { - return InternalError("Expected at least one GPU architecture."); - } - if (!generate_fatbin_ && architectures_.size() > 1) { - return InternalError( - "Can only generate machine code for more than one architecture as a " - "fatbin."); - } - llvm::LLVMContext llvmContext; - #if TENSORFLOW_USE_ROCM auto llvmModule = mlir::translateModuleToROCDLIR(gpu_module, llvmContext); if (!llvmModule) { @@ -95,14 +81,9 @@ class GpuKernelToBlobPass xla::HloModuleConfig config; config.set_debug_options(xla::GetDebugOptionsFromFlags()); - // TODO(b/169066682): Support fatbin on ROCm. 
-    if (generate_fatbin_) {
-      return InternalError("Fatbins are not yet supported for ROCm.");
-    }
-
-    uint32_t arch = architectures_.front();
     std::string libdevice_dir = tensorflow::RocdlRoot();
-    return xla::gpu::amdgpu::CompileToHsaco(llvmModule.get(), arch, config,
+
+    return xla::gpu::amdgpu::CompileToHsaco(llvmModule.get(), arch_, config,
                                             libdevice_dir);
 
 #elif GOOGLE_CUDA
@@ -121,42 +102,19 @@ class GpuKernelToBlobPass
       target->Options.AllowFPOpFusion = llvm::FPOpFusion::FPOpFusionMode::Fast;
     };
 
-    // Compile and collect requested cubin and PTX images.
-    std::vector<tensorflow::se::CubinOrPTXImage> images;
+    int32_t cc_major = arch_ / 10;
+    int32_t cc_minor = arch_ % 10;
     TF_ASSIGN_OR_RETURN(std::string libdevice_dir, GetLibdeviceDir(config));
-    auto gpu_asm_opts = xla::gpu::PtxOptsFromConfig(config);
-    for (uint32_t arch : architectures_) {
-      int32_t cc_major = arch / 10;
-      int32_t cc_minor = arch % 10;
-      // Module may be changed by CompileToPtx.
-      auto llvmModuleCopy = llvm::CloneModule(*llvmModule);
-      TF_ASSIGN_OR_RETURN(
-          std::string ptx,
-          xla::gpu::nvptx::CompileToPtx(llvmModuleCopy.get(),
-                                        std::make_pair(cc_major, cc_minor),
-                                        config, libdevice_dir, enable_fusion));
-      // TODO(b/169066682): If compute_XX profile, collect PTX image here.
-      VLOG(1) << ptx;
-      TF_ASSIGN_OR_RETURN(std::vector<uint8_t> gpu_asm,
-                          tensorflow::se::CompileGpuAsm(
-                              cc_major, cc_minor, ptx.c_str(), gpu_asm_opts));
+    TF_ASSIGN_OR_RETURN(
+        std::string ptx,
+        xla::gpu::nvptx::CompileToPtx(llvmModule.get(),
+                                      std::make_pair(cc_major, cc_minor),
+                                      config, libdevice_dir, enable_fusion));
+    VLOG(1) << ptx;
 
-      if (!generate_fatbin_) {
-        // Skip fatbin generation and return the first and only GPU machine
-        // code.
-        return gpu_asm;
-      }
-
-      // Collect cubin image.
-      images.push_back({absl::StrCat("sm_", arch), std::move(gpu_asm)});
-    }
-
-    // TODO(b/169870789): Revisit the use of fatbins.
-    // Bundle cubin and PTX images into a single fatbin.
-    return tensorflow::se::BundleGpuAsm(images,
-                                        gpu_asm_opts.preferred_cuda_dir);
+    return tensorflow::se::CompileGpuAsm(cc_major, cc_minor, ptx.c_str(),
+                                         xla::gpu::PtxOptsFromConfig(config));
 #endif
-
     return InternalError(
         "Neither TENSORFLOW_USE_ROCM nor GOOGLE_CUDA are defined."
         " Did you specify either --config=rocm or --config=cuda ?");
@@ -183,10 +141,8 @@ class GpuKernelToBlobPass
 }  // namespace
 
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
-    mlir::StringRef blob_annotation, ArrayRef<uint32_t> architectures,
-    bool generate_fatbin) {
-  return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architectures,
-                                               generate_fatbin);
+    mlir::StringRef blob_annotation, int32_t architecture) {
+  return std::make_unique<GpuKernelToBlobPass>(blob_annotation, architecture);
 }
 
 }  // namespace transforms
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
index 43e464645a2..2ef863a394c 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h
@@ -61,8 +61,7 @@ CreatePropagateTensorFlowABIKnowledgePass(
 
 // Pass to annotate GPU Module with its PTX.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>> CreateGpuKernelToBlobPass(
-    mlir::StringRef blob_annotation = "", ArrayRef<uint32_t> architectures = {},
-    bool generate_fatbin = true);
+    mlir::StringRef blob_annotation = "", int32_t architecture = 0);
 
 // Pass to unfuse batch norm.
 std::unique_ptr<mlir::FunctionPass> CreateUnfuseBatchNormPass();
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td
index e84971bbf69..5bdd466732b 100644
--- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td
+++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td
@@ -53,10 +53,7 @@ def GpuKernelToBlobPass : Pass<"gpu-kernel-to-blob", "gpu::GPUModuleOp"> {
   let options = [
     Option<"blob_annotation_", "blob-annotation", "std::string",
            /*default=*/"", "Blob attribute name">,
-    ListOption<"architectures_", "arch", "uint32_t", "GPU architectures">,
-    Option<"generate_fatbin_", "generate-fatbin", "bool", /*default=*/"true",
-           "Bundle machine code for the different architectures in one "
-           "fatbin.">,
+    Option<"arch_", "arch", "int32_t", /*default=*/"0", "GPU architecture">,
   ];
   let constructor = "transforms::CreateGpuKernelToBlobPass()";
 }
diff --git a/tensorflow/core/kernels/mlir_generated/build_defs.bzl b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
index 79944cf2ca9..93e2e555135 100644
--- a/tensorflow/core/kernels/mlir_generated/build_defs.bzl
+++ b/tensorflow/core/kernels/mlir_generated/build_defs.bzl
@@ -296,6 +296,9 @@ def _gen_unranked_kernel_fatbin_impl(ctx):
         archs_trimmed.append(arch[3:])
     arch_flag = ",".join(archs_trimmed)
 
+    # TODO(b/169066682): Generate Fatbin when lowering GPU module.
+    arch_flag = "75"
+
     filename = "%s.a" % (name)
     gpu_bin = ctx.outputs.output
     ctx.actions.run(
diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD
index 56d24bffef0..ea65d7aee5c 100644
--- a/tensorflow/stream_executor/cuda/BUILD
+++ b/tensorflow/stream_executor/cuda/BUILD
@@ -104,7 +104,7 @@ cc_library(
 
 # Buildozer can not remove dependencies inside select guards, so we have to use
 # an intermediate target.
-cc_library(name = "cuda_root_wrapper")
+cc_library(name = "ptxas_wrapper")
 
 cc_library(
     name = "cuda_driver",
diff --git a/tensorflow/stream_executor/gpu/BUILD b/tensorflow/stream_executor/gpu/BUILD
index 8626c34d383..a2696bd0088 100644
--- a/tensorflow/stream_executor/gpu/BUILD
+++ b/tensorflow/stream_executor/gpu/BUILD
@@ -250,7 +250,7 @@ cc_library(
         "@com_google_absl//absl/container:flat_hash_map",
     ]) + if_cuda_is_configured([
         "//tensorflow/stream_executor/cuda:cuda_driver",
-        "//tensorflow/stream_executor/cuda:cuda_root_wrapper",
+        "//tensorflow/stream_executor/cuda:ptxas_wrapper",
     ]),
 )
diff --git a/tensorflow/stream_executor/gpu/asm_compiler.cc b/tensorflow/stream_executor/gpu/asm_compiler.cc
index 53f76503f2a..0f6fd4de910 100644
--- a/tensorflow/stream_executor/gpu/asm_compiler.cc
+++ b/tensorflow/stream_executor/gpu/asm_compiler.cc
@@ -140,44 +140,34 @@ port::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int device_ordinal,
   return CompileGpuAsm(cc_major, cc_minor, ptx_contents, options);
 }
 
-static std::string findCudaExecutable(const std::string binary_name,
-                                      const std::string preferred_cuda_dir) {
-#if defined(PLATFORM_WINDOWS)
-  const std::string binary_filename = binary_name + ".exe";
-#else
-  const std::string& binary_filename = binary_name;
-#endif
-
-  // Search in cuda root candidates.
-  auto env = tensorflow::Env::Default();
-  std::string binary_path;
-  for (const std::string& cuda_root :
-       tensorflow::CandidateCudaRoots(preferred_cuda_dir)) {
-    binary_path = tensorflow::io::JoinPath(cuda_root, "bin", binary_filename);
-    VLOG(2) << "Looking for " << binary_filename << " at " << binary_path;
-    if (env->FileExists(binary_path).ok()) {
-      break;
-    }
-  }
-  if (!env->FileExists(binary_path).ok()) {
-    // Rely on subprocess invocation to find the correct binary.
-    binary_path = binary_filename;
-  }
-  VLOG(2) << "Using " << binary_filename << " at " << binary_path;
-  return binary_path;
-}
-
 port::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int cc_major, int cc_minor,
                                                    const char* ptx_contents,
                                                    GpuAsmOpts options) {
-  std::string ptxas_path =
-      findCudaExecutable("ptxas", options.preferred_cuda_dir);
+  std::string ptxas_path;
+  auto env = tensorflow::Env::Default();
+  std::string ptxas_binary_name = "ptxas";
+#if defined(PLATFORM_WINDOWS)
+  ptxas_binary_name += ".exe";
+#endif
+
+  for (const std::string& cuda_root :
+       tensorflow::CandidateCudaRoots(options.preferred_cuda_dir)) {
+    ptxas_path = tensorflow::io::JoinPath(cuda_root, "bin", ptxas_binary_name);
+    VLOG(2) << "Looking for ptxas at " << ptxas_path;
+    if (env->FileExists(ptxas_path).ok()) {
+      break;
+    }
+  }
+  if (!env->FileExists(ptxas_path).ok()) {
+    // Rely on subprocess invocation to find the correct binary.
+    ptxas_path = ptxas_binary_name;
+  }
+  VLOG(2) << "Using ptxas at " << ptxas_path;
 
   WarnIfBadPtxasVersion(ptxas_path);
 
   // Write ptx into a temporary file.
   std::string ptx_path;
-  auto env = tensorflow::Env::Default();
   if (!env->LocalTempFilename(&ptx_path)) {
     return port::InternalError("couldn't get temp PTX file name");
   }
@@ -242,78 +232,4 @@ port::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int cc_major, int cc_minor,
   return cubin_vector;
 }
 
-port::StatusOr<std::vector<uint8_t>> BundleGpuAsm(
-    std::vector<CubinOrPTXImage> images, const std::string preferred_cuda_dir) {
-  std::string fatbinary_path =
-      findCudaExecutable("fatbinary", preferred_cuda_dir);
-
-  // Write images to temporary files.
-  std::vector<std::string> image_paths;
-  auto env = tensorflow::Env::Default();
-  for (const CubinOrPTXImage& img : images) {
-    std::string img_path;
-    if (!env->LocalTempFilename(&img_path)) {
-      return port::InternalError(
-          "Could not get temporary filenames for images.");
-    }
-    TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(
-        env, img_path, std::string(img.bytes.begin(), img.bytes.end())));
-    VLOG(2) << "image written to " << img_path;
-    image_paths.push_back(std::move(img_path));
-  }
-  auto image_files_cleaner = tensorflow::gtl::MakeCleanup([&image_paths] {
-    for (const auto& path : image_paths) {
-      TF_CHECK_OK(tensorflow::Env::Default()->DeleteFile(path));
-    }
-  });
-
-  // Prepare temorary result file.
-  std::string result_path;
-  if (!env->LocalTempFilename(&result_path)) {
-    return port::InternalError(
-        "Could not get temporary filename for fatbin result.");
-  }
-  auto result_file_cleaner = tensorflow::gtl::MakeCleanup([&result_path] {
-    // This file may never be created, so the failure to delete it should not
-    // propagate to TF.
-    tensorflow::Env::Default()->DeleteFile(result_path).IgnoreError();
-  });
-
-  // Invoke fatbinary and collect its output.
-  tensorflow::SubProcess fatbinary;
-  std::vector<std::string> fatbinary_args = {
-      fatbinary_path, "--64", "--cmdline=--compile-only",
-      "--link", "--compress-all", absl::StrCat("--create=", result_path)};
-  assert(images.size() == image_paths.size());
-  for (int i = 0; i < images.size(); i++) {
-    fatbinary_args.push_back(absl::StrFormat(
-        "--image=profile=%s,file=%s", images[i].profile, image_paths[i]));
-  }
-  if (VLOG_IS_ON(3)) {
-    VLOG(3) << absl::StrJoin(fatbinary_args, " ");
-  }
-  fatbinary.SetProgram(fatbinary_path, fatbinary_args);
-  fatbinary.SetChannelAction(tensorflow::CHAN_STDERR, tensorflow::ACTION_PIPE);
-  if (!fatbinary.Start()) {
-    return port::InternalError("Failed to launch fatbinary.");
-  }
-  std::string stderr_output;
-  int exit_status = fatbinary.Communicate(
-      /*stdin_input=*/nullptr, /*stdout_output=*/nullptr, &stderr_output);
-  if (exit_status != 0) {
-    return port::InternalError(absl::StrFormat(
-        "fatbinary exited with non-zero error code %d, output: %s", exit_status,
-        stderr_output));
-  }
-  if (!stderr_output.empty()) {
-    VLOG(2) << stderr_output;
-  }
-
-  // Read in the result and return it as a byte vector.
-  std::string result_blob;
-  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(tensorflow::Env::Default(),
-                                                  result_path, &result_blob));
-  return std::vector<uint8_t>(result_blob.begin(), result_blob.end());
-}
-
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/gpu/asm_compiler.h b/tensorflow/stream_executor/gpu/asm_compiler.h
index 513ac6ca867..e5f67a71242 100644
--- a/tensorflow/stream_executor/gpu/asm_compiler.h
+++ b/tensorflow/stream_executor/gpu/asm_compiler.h
@@ -52,16 +52,6 @@ port::StatusOr<std::vector<uint8_t>> CompileGpuAsm(int cc_major, int cc_minor,
 port::StatusOr<std::vector<uint8_t>> CompileGpuAsmOrGetCached(
     int device_ordinal, const char* ptx, GpuAsmOpts compilation_options);
 
-struct CubinOrPTXImage {
-  std::string profile;
-  std::vector<uint8_t> bytes;
-};
-
-// Bundles the GPU machine code (cubins) and PTX if requested and returns the
-// resulting binary (i.e. a fatbin) as a byte array.
-port::StatusOr<std::vector<uint8_t>> BundleGpuAsm(
-    std::vector<CubinOrPTXImage> images, const std::string preferred_cuda_dir);
-
 }  // namespace stream_executor
 
 #endif  // TENSORFLOW_STREAM_EXECUTOR_GPU_ASM_COMPILER_H_

From 99af4797cb0b7d0815f420042226e67c054322be Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 3 Oct 2020 02:01:33 -0700
Subject: [PATCH 53/54] compat: Update forward compatibility horizon to
 2020-10-03

PiperOrigin-RevId: 335174273
Change-Id: Ic36ab412be399d0193e38dd8ea4c9100440aded7
---
 tensorflow/python/compat/compat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index c21b774e0f2..ab02032c0a3 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -33,7 +33,7 @@ from tensorflow.python.util.tf_export import tf_export
 # This value changes every day with an automatic CL. It can be modified in code
 # via `forward_compatibility_horizon()` or with the environment variable
 # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date.
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 10, 2)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2020, 10, 3)
 _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS"
 _FORWARD_COMPATIBILITY_DATE_NUMBER = None

From 2256067e7038786530f0f50265893668b5a7330e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 3 Oct 2020 02:01:35 -0700
Subject: [PATCH 54/54] Update GraphDef version to 543.

PiperOrigin-RevId: 335174276
Change-Id: I677bd2a95ac15df1b6d7f18dd1934be737bba004
---
 tensorflow/core/public/version.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index a683466dbf1..2fa52a94e0e 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -108,7 +108,7 @@ limitations under the License.
 #define TF_GRAPH_DEF_VERSION_MIN_PRODUCER 0
 #define TF_GRAPH_DEF_VERSION_MIN_CONSUMER 0
 
-#define TF_GRAPH_DEF_VERSION 542  // Updated: 2020/10/2
+#define TF_GRAPH_DEF_VERSION 543  // Updated: 2020/10/3
 
 // Checkpoint compatibility versions (the versions field in SavedSliceMeta).
 //
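
Note on the single-architecture convention used by the kernel generator
change above: `--arch` now selects exactly one target, encoded as a
two-digit compute capability (e.g. 75 for sm_75), and
gpu_kernel_to_blob_pass.cc splits that value into major/minor parts with
/10 and %10 before compiling PTX. The sketch below is illustrative only
and is not part of this series; ArchToSmString() and main() are made-up
helpers, not TensorFlow code.

    // Minimal standalone sketch (assumption: arch values are two-digit
    // compute capabilities, as in --arch=75). Mirrors the
    // cc_major/cc_minor split done in gpu_kernel_to_blob_pass.cc.
    #include <cstdint>
    #include <iostream>
    #include <string>

    std::string ArchToSmString(int32_t arch) {
      int32_t cc_major = arch / 10;  // e.g. 75 -> 7
      int32_t cc_minor = arch % 10;  // e.g. 75 -> 5
      return "sm_" + std::to_string(cc_major) + std::to_string(cc_minor);
    }

    int main() {
      // With fatbin bundling removed, tf_to_kernel honors only the first
      // value passed to --arch (architecture.front() in tf_to_kernel.cc).
      for (int32_t arch : {50, 60, 70, 75}) {
        std::cout << arch << " -> " << ArchToSmString(arch) << "\n";
      }
      return 0;
    }

In the patch itself the same major/minor pair feeds
xla::gpu::nvptx::CompileToPtx() and tensorflow::se::CompileGpuAsm() for a
single architecture, instead of looping over an architecture list and
bundling the resulting cubins with the fatbinary tool.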