Compare commits


624 Commits

Author SHA1 Message Date
Mihai Maruseac
cdf2c541c3
Merge pull request #45767 from tensorflow-jenkins/relnotes-2.0.4-9907
Update release notes for TensorFlow 2.0.4
2021-01-04 12:19:09 -08:00
Mihai Maruseac
7041a615ec
Update RELEASE.md 2021-01-04 11:57:46 -08:00
Mihai Maruseac
47c3737ee5
Merge pull request #46153 from tensorflow/mm-cherry-pick-sqlite-bump-on-r2.0
Update SQLite to the latest sqlite-amalgamation-3340000
2021-01-04 11:34:45 -08:00
Yong Tang
610e7edd3d Update SQLite to the latest sqlite-amalgamation-3340000
This PR updates SQLite to the latest sqlite-amalgamation-3340000

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
2021-01-04 11:23:01 -08:00
Mihai Maruseac
ad6e7edf68
Merge pull request #45884 from tensorflow/mm-fix-broken-cherrypick-on-r2.0
Disable failing test
2020-12-19 19:03:41 -08:00
Mihai Maruseac
f3dcdbbc53 Disable failing test 2020-12-19 19:01:58 -08:00
Mihai Maruseac
33f107fa6f
Merge pull request #45810 from tensorflow/fix-cp-2.0
Fix bad cherrypick
2020-12-17 14:30:28 -08:00
Geeta Chavan
e006acfa2e Fix bad cherrypick 2020-12-17 13:56:59 -08:00
Mihai Maruseac
ef46b8fc9d
Merge pull request #45803 from tensorflow/update_import_path
Fix import path
2020-12-17 12:55:17 -08:00
Geeta Chavan
550dd04e20 Fix import path 2020-12-17 12:41:13 -08:00
Mihai Maruseac
412d527a54
Merge pull request #45525 from tensorflow/mm-cherrypick-6d7da36623b-on-r2.0
Prevent uninitialized memory access in `GraphConstructor::MakeEdge`
2020-12-17 09:36:05 -08:00
Mihai Maruseac
46b39ddd5b
Merge pull request #45524 from tensorflow/mm-cherrypick-b95ccc06e04-on-r2.0
Prevent CHECK-fail in LSTM/GRU with zero-length input.
2020-12-17 09:35:40 -08:00
Mihai Maruseac
3aa68a19d9
Merge pull request #45523 from tensorflow/mm-cherrypick-c1e1fc899ad5f8c725dcbb6470069890b5060bc7-on-r2.0
Mark `MemmappedTensorAllocator` as returning opaque handle.
2020-12-17 09:34:55 -08:00
Mihai Maruseac
906099567b
Merge pull request #45522 from tensorflow/mm-cherrypick-faf7af8ef8a-on-r2.0
Validate that `DataFormat*` attributes form a permutation.
2020-12-17 09:34:42 -08:00
Mihai Maruseac
1e93fea67d
Merge pull request #45521 from tensorflow/mm-cherrypick-ace0c15a22f7f054abcc1f53eabbcb0a1239a9e2-on-r2.0
Default initialize fixed point Eigen types.
2020-12-17 09:34:10 -08:00
Mihai Maruseac
a0bb7cae4c
Merge pull request #45772 from tensorflow-jenkins/version-numbers-2.0.4-3081
Update version numbers for TensorFlow 2.0.4
2020-12-17 09:20:00 -08:00
TensorFlow Release Automation
07b946f108 Update version numbers to 2.0.4 2020-12-16 16:48:46 -08:00
TensorFlow Release Automation
3475c0f2c2 Insert release notes place-fill 2020-12-16 16:31:31 -08:00
Mihai Maruseac
9f3305ea15
Merge pull request #45549 from tensorflow/mm-cherry-pick-pcre-fixes-on-r2.0
Update PCRE library from 8.42 to 8.44
2020-12-16 09:53:00 -08:00
Mihai Maruseac
27b086e0df
Merge pull request #43988 from tensorflow/mm-cherry-pick-java-fixes-on-r2.0
Bump junit from 4.11 to 4.13.1.
2020-12-16 09:48:22 -08:00
Mihai Maruseac
b86785cdd3
Merge pull request #44847 from tensorflow/mm-cherry-pick-libjpeg-turbo-on-r2.0
Bump libjpeg-turbo from 2.0.4 to 2.0.5
2020-12-16 09:45:48 -08:00
Mihai Maruseac
cf5067a6a7
Merge pull request #45711 from tensorflow/mm-pin-h5py-on-r2.0
Add upper bound to `h5py`.
2020-12-15 15:52:25 -08:00
Mihai Maruseac
3fe8742f0f
Merge pull request #45722 from tensorflow/mm-disable-segfault-tests-on-r20
Disable a few tests.
2020-12-15 15:46:11 -08:00
Mihai Maruseac
1dbb3af0da Disable a few tests.
These tests now segfault after a dependency beneath us was updated.
2020-12-15 15:23:10 -08:00
Mihai Maruseac
0094ab4ecb Add upper bound to h5py.
Newer versions of `h5py` would cause errors in Keras tests due to a
difference between `unicode` and `str`. Since `h5py` comes from `keras`
as an unbounded dependency, we have to pin it manually this way.
2020-12-15 12:13:06 -08:00
Yong Tang
c7667263be Update PCRE library from 8.42 to 8.44
This PR updates PCRE library from 8.42 to 8.44.

Note there is a CVE related to the old 8.42 (https://nvd.nist.gov/vuln/detail/CVE-2019-20838#VulnChangeHistorySection)

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
2020-12-09 11:22:23 -08:00
Mihai Maruseac
e24dbaf797 Prevent uninitialized memory access in GraphConstructor::MakeEdge
The `MakeEdge` implementation assumes that there exists an output at `output_index` of the `src` node and an input at `input_index` of the `dst` node. However, if this is not the case, this results in accessing data out of bounds. Because we are accessing an array that is a private member of a class, and only in read-only mode, this usually results only in uninitialized memory access. However, it is reasonable to think that malicious users could manipulate these indexes to actually read data outside the class, thus resulting in information leakage and further exploits.

PiperOrigin-RevId: 346343288
Change-Id: I2127da27c2023d27f26efd39afa6c853385cab6f
2020-12-08 18:30:07 -08:00
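The bounds problem this commit describes can be illustrated with a minimal Python sketch (names and signature hypothetical; the actual fix lives in the C++ `GraphConstructor`): before wiring an edge, validate both indices against the nodes' actual output/input counts instead of trusting the caller.

```python
def make_edge(src_num_outputs, output_index, dst_num_inputs, input_index):
    """Hypothetical sketch of the validation described above.

    Reject the edge unless the requested output of `src` and input of
    `dst` actually exist; otherwise the lookup would read out of bounds.
    """
    if not 0 <= output_index < src_num_outputs:
        raise ValueError(
            f"output_index {output_index} not in [0, {src_num_outputs})")
    if not 0 <= input_index < dst_num_inputs:
        raise ValueError(
            f"input_index {input_index} not in [0, {dst_num_inputs})")
    return (output_index, input_index)
```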
Mihai Maruseac
84d7ddf6d6 Prevent CHECK-fail in LSTM/GRU with zero-length input.
PiperOrigin-RevId: 346239181
Change-Id: I5f233dbc076aab7bb4e31ba24f5abd4eaf99ea4f
2020-12-08 18:29:54 -08:00
Mihai Maruseac
07d2f0766c Mark MemmappedTensorAllocator as returning opaque handle.
This allocator is used for `ImmutableConstantOp` and it returns a handle to the contents of a memory mapped file which is supposed to represent a tensor.

For tensors of complex types (resources, variables and strings), allocators which are not marked as returning opaque handles will call placement new to initialize each element. This means writing to the buffer. However, in our case, the buffer is immutable and already contains the tensor data. Hence, writing to it is both destructive and causes a crash.

PiperOrigin-RevId: 345786451
Change-Id: I46369c50fa60b3431709ffe068a728d3061f49c4
2020-12-08 18:29:42 -08:00
Mihai Maruseac
54461b130d Validate that DataFormat* attributes form a permutation.
The `src_format` and `dst_format` attributes for the `DataFormatDimMap` and `DataFormatVecPermute` raw ops are supposed to determine a permutation. However, this was not validated and could result in uninitialized memory accesses, out-of-bounds writes, and potential crashes.

While here, we also test that the format attributes have the needed length, add tests for all validation failure cases, remove unnecessary calls to `strings::StrCat`, and fix a few grammar errors.

This will be cherry-picked on the supported release branches.

PiperOrigin-RevId: 346135579
Change-Id: I1c76392382c89ad8f072d5bc93d70669851eb404
2020-12-08 18:29:26 -08:00
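The permutation check this commit adds can be sketched in a few lines of Python (a sketch under assumed semantics; the real validation is in the C++ kernels): the two format strings must have the same length, the same characters, and no duplicates.

```python
def forms_permutation(src_format, dst_format):
    """Return True iff dst_format rearranges exactly the characters of
    src_format, with no duplicates (e.g. "NHWC" -> "NCHW")."""
    return (len(src_format) == len(dst_format)
            and len(set(src_format)) == len(src_format)
            and sorted(src_format) == sorted(dst_format))
```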
Mihai Maruseac
a3b91595bb Default initialize fixed point Eigen types.
In certain cases, tensors are filled with default values of the type. But, for these fixed point types, these values were uninitialized. Thus, we would have uninitialized memory access bugs, some of which were caught by MSAN.

PiperOrigin-RevId: 344101137
Change-Id: I14555fda74dca3b5f1582da9008901937e3f14e2
2020-12-08 18:29:17 -08:00
Yong Tang
e57caee599 Bump libjpeg-turbo from 2.0.4 to 2.0.5
It looks like the latest libjpeg-turbo is 2.0.5, so this PR
bumps the version (currently 2.0.4).

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
2020-11-13 09:54:47 -08:00
dependabot[bot]
0804acb639 Bump junit in /tensorflow/java/maven/spark-tensorflow-connector
Bumps [junit](https://github.com/junit-team/junit4) from 4.11 to 4.13.1.
- [Release notes](https://github.com/junit-team/junit4/releases)
- [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.11.md)
- [Commits](https://github.com/junit-team/junit4/compare/r4.11...r4.13.1)

Signed-off-by: dependabot[bot] <support@github.com>
2020-10-13 16:11:44 -07:00
dependabot[bot]
58b28de21d Bump junit in /tensorflow/java/maven/tensorflow-hadoop
Bumps [junit](https://github.com/junit-team/junit4) from 4.11 to 4.13.1.
- [Release notes](https://github.com/junit-team/junit4/releases)
- [Changelog](https://github.com/junit-team/junit4/blob/main/doc/ReleaseNotes4.11.md)
- [Commits](https://github.com/junit-team/junit4/compare/r4.11...r4.13.1)

Signed-off-by: dependabot[bot] <support@github.com>
2020-10-13 16:11:44 -07:00
Mihai Maruseac
295ad27816
Merge pull request #43443 from tensorflow-jenkins/version-numbers-2.0.3-23981
Update version numbers for TensorFlow 2.0.3
2020-09-21 18:54:47 -07:00
TensorFlow Release Automation
c574e641ff Update version numbers to 2.0.3 2020-09-21 18:49:30 -07:00
Mihai Maruseac
1bdd956e63
Merge pull request #43438 from tensorflow-jenkins/relnotes-2.0.3-26591
Update release notes for TensorFlow 2.0.3
2020-09-21 18:46:10 -07:00
Mihai Maruseac
60f0b7b1db
Update RELEASE.md 2020-09-21 18:45:55 -07:00
TensorFlow Release Automation
e081967f1d Insert release notes place-fill 2020-09-21 17:09:55 -07:00
Mihai Maruseac
975a52309c
Merge pull request #43411 from tensorflow/mm-fix-2.0
Fix tests broken by merge conflicts
2020-09-20 20:26:16 -07:00
Mihai Maruseac
0ea1b845d8 Add missing import 2020-09-20 20:24:33 -07:00
Mihai Maruseac
b98dc690f1 No disable_tfrt present on this branch 2020-09-20 20:24:33 -07:00
Mihai Maruseac
4ec48a161f
Merge pull request #43407 from tensorflow/mihaimaruseac-patch-2
Remove import that is not needed
2020-09-20 17:49:39 -07:00
Mihai Maruseac
c55d0c3008
Remove import that is not needed 2020-09-20 17:48:57 -07:00
Mihai Maruseac
a6752f0ceb
Merge pull request #43404 from tensorflow/mihaimaruseac-patch-1
Fix import path
2020-09-20 17:04:04 -07:00
Mihai Maruseac
e47dd65e72
Fix import path 2020-09-20 17:03:52 -07:00
Mihai Maruseac
63e6ed8802
Merge pull request #43402 from tensorflow/mihaimaruseac-patch-2-1
Cast away a const in intermediate API
2020-09-20 16:57:28 -07:00
Mihai Maruseac
baa1b690f7
Cast away a const in intermediate API 2020-09-20 16:57:05 -07:00
Mihai Maruseac
4573558d1c
Merge pull request #43399 from tensorflow/mihaimaruseac-patch-1
Fix typo in macro
2020-09-20 16:35:45 -07:00
Mihai Maruseac
67e53f87ad
Fix typo in macro 2020-09-20 16:35:35 -07:00
Mihai Maruseac
43903c16d0
Merge pull request #43395 from tensorflow/mihaimaruseac-patch-2
Fix import path
2020-09-20 15:58:37 -07:00
Mihai Maruseac
5502bd7d05
Fix import path 2020-09-20 15:58:23 -07:00
Mihai Maruseac
b71eb54495
Merge pull request #43389 from tensorflow/mihaimaruseac-patch-2
Solve leftover from merge conflict
2020-09-20 15:25:53 -07:00
Mihai Maruseac
3f13b7ef99
Solve leftover from merge conflict 2020-09-20 15:24:29 -07:00
Mihai Maruseac
7732945f62
Merge pull request #43355 from tensorflow/mm-patch-r2.0
Patch for TF 2.0.3
2020-09-20 12:30:09 -07:00
Mihai Maruseac
ec14e1b429 Fix undefined behavior in tf.raw_ops.Switch in eager mode.
PiperOrigin-RevId: 332578058
Change-Id: I9727571d2f21476b10d8aa27c1b7176564b76ac9
2020-09-20 10:35:38 -07:00
Mihai Maruseac
0615b26093 Fix heap buffer overflow in tf.raw_ops.SparseFillEmptyRowsGrad.
Also add tests as they were lacking

PiperOrigin-RevId: 332566071
Change-Id: I44277578e26ff5fb3fdb0dcbba6e91b2ec3e7859
2020-09-20 09:50:00 -07:00
Mihai Maruseac
446b0ead53 Prevent integer truncation from 64 to 32 bits.
The last argument of the `tensorflow::Shard` function must be a two-argument function where both arguments are `int64` (`long long`, 64 bits). However, there are usages where code passes in a function whose arguments are `int` or `int32` (32 bits). In these cases, it is possible that the integer truncation would later cause a segfault or other unexpected behavior.

PiperOrigin-RevId: 332560414
Change-Id: Ief649406babc8d4f60b3e7a9d573cbcc5ce5b767
2020-09-19 19:57:30 -07:00
Mihai Maruseac
3a322cb0f4 Prevent int64 to int truncation in Shard API usage.
The function argument in `Shard` must be a function of two `int64` arguments. However, we are passing in a function with two `int` arguments. Thus, for large workloads, these arguments get truncated from positive `int64` values to negative `int` ones, resulting in a buffer out of bounds write.

PiperOrigin-RevId: 332557334
Change-Id: I236c9a2e7f53580e520571da8ba941a3aa9fa0b5
2020-09-19 19:41:42 -07:00
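The truncation both `Shard` commits describe is easy to reproduce. This is a demonstration, not TensorFlow code: Python's `ctypes` is used to emulate C++'s implicit narrowing from a 64-bit to a 32-bit integer, showing how a valid positive workload size wraps to a negative value.

```python
import ctypes

def as_cpp_int32(x):
    # Emulate the implicit int64 -> int32 narrowing that happens in C++
    # when a 64-bit value is passed to a parameter of 32-bit int type.
    return ctypes.c_int32(x).value

workload = 3_000_000_000       # a valid, positive int64 element count
wrapped = as_cpp_int32(workload)
print(wrapped)                 # a negative 32-bit value: buffer index gone bad
```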
Mihai Maruseac
e432a7a775 Prevent format string vulnerability in tf.strings.as_string.
The `printf` format specifier only allows `#`, `0`, `-`, `+` and space as flag characters. Others are interpreted as width/precision/length modifiers or conversion specifiers. If a character does not fit into any of these sets, `printf` just displays it.

Also add a test suite for `tf.strings.as_string`. Also fix the issue where the flag character was used only if width was specified.

PiperOrigin-RevId: 332553548
Change-Id: Ie57cf2a7c14d1a36097642794c14329db669bbba
2020-09-19 19:30:19 -07:00
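A minimal sketch of the kind of validation this commit describes (hypothetical names; the real fix is in the C++ kernel for `tf.strings.as_string`): accept only the characters `printf` defines as flags, and reject anything else before it ever reaches a format string.

```python
ALLOWED_PRINTF_FLAGS = set("#0-+ ")  # the only flag characters printf permits

def validate_fill_char(fill):
    """Reject fill/flag characters that printf would instead interpret
    as a width, precision, length modifier, or conversion specifier."""
    if fill and fill not in ALLOWED_PRINTF_FLAGS:
        raise ValueError(f"invalid fill character {fill!r}")
    return fill
```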
Mihai Maruseac
8cdee3c8fe Prevent segfault in GetSessionHandle{,V2}.
In eager mode, session state is null.

PiperOrigin-RevId: 332548597
Change-Id: If094812c2e094044220b9ba28f7d7601be042f38
2020-09-19 19:27:52 -07:00
Mihai Maruseac
f95fa5faf4 Validate data_splits for tf.StringNGrams.
Without validation, we can cause a heap buffer overflow which results in data leakage and/or segfaults.

PiperOrigin-RevId: 332543478
Change-Id: Iee5bda24497a195d09d122355502480830b1b317
2020-09-19 19:16:12 -07:00
Mihai Maruseac
b902fac120 Fix bad import 2020-09-19 18:50:15 -07:00
Mihai Maruseac
4c1c846f60 Validate NodeDefs from FunctionDefLibrary of a GraphDef.
We already validated `NodeDef`s from a `GraphDef` but missed validating those from the `FunctionDefLibrary`. Thus, some maliciously crafted models could evade detection and cause denial of service due to a `CHECK`-fail.

PiperOrigin-RevId: 332536309
Change-Id: I052efe919ff1fe2f90815e286a1aa4c54c7b94ff
2020-09-19 18:44:48 -07:00
Mihai Maruseac
e4d36adab4 Prevent loading saved models where constant nodes have no tensor value.
Also reorder fuzz generated test cases following f760f88b42

PiperOrigin-RevId: 308339007
Change-Id: I11d825203964cf3397846c57fd4a6f458e8536f3
2020-09-19 18:44:48 -07:00
Mihai Maruseac
db6e19099e Properly handle negative shape dimensions from improper saved models.
PiperOrigin-RevId: 308283636
Change-Id: Ib10849425de7d541d8dacfe4d0c709fbac9180b6
2020-09-19 18:44:48 -07:00
Mihai Maruseac
ce945d5b0e [tflite] Ensure ResolveAxis properly handles negative inputs.
In Python, a list `l` of length `n` allows indexing with negative indices, `l[i]`. The only constraint is that `n + i` must be nonnegative. Code in `ResolveAxis` assumes this constraint and only checks it using a `DCHECK`. But the macro is a no-op in non-debug builds, which can result in reading from negative offsets (buffer underflows).

PiperOrigin-RevId: 332530683
Change-Id: I464e073fee618054ae3719a3679739007bb3f3bc
2020-09-19 18:19:28 -07:00
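In Python terms, the hardened check this commit describes looks roughly like this (a sketch only; the real code is C++ in TFLite's kernel utilities): always map negative axes into range and reject the result if it is still out of bounds, rather than relying on a debug-only `DCHECK`.

```python
def resolve_axis(axis, num_dims):
    """Map a possibly negative axis into [0, num_dims), rejecting
    out-of-range values in all builds, not just debug builds."""
    if axis < 0:
        axis += num_dims
    if not 0 <= axis < num_dims:
        raise ValueError(f"axis out of range for rank {num_dims}")
    return axis
```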
Mihai Maruseac
79deaeb06c [tflite] Ensure MatchingDim does not allow buffer overflow.
We check in `MatchingDim` that both arguments have the same dimensionality; however, that is a `DCHECK`, only enabled in debug builds. Hence, it could be possible to cause buffer overflows by passing in a tensor with larger dimensions as the second argument. To fix, we now make `MatchingDim` return the minimum of the two sizes.

A much better fix would be to return a status object but that requires refactoring a large part of the codebase for minor benefits.

PiperOrigin-RevId: 332526127
Change-Id: If627d0d2c80a685217b6e0d1e64b0872dbf1c5e4
2020-09-19 18:09:52 -07:00
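The min-of-the-two-sizes behavior this commit settles on can be sketched as follows (hypothetical Python stand-in for the C++ helper): rather than trusting a debug-only equality check, clamp to the smaller dimension so a mismatched second argument can never drive a loop out of bounds.

```python
def matching_dim(shape_a, index_a, shape_b, index_b):
    """Sketch of the hardened helper: return the smaller of the two
    dimension sizes instead of assuming (via a debug-only check) that
    they are equal."""
    return min(shape_a[index_a], shape_b[index_b])
```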
Mihai Maruseac
7bb92eeb9f [tflite] Ensure input tensors don't have nullptr buffers.
A crafted TFLite model can force a node to have as input a tensor backed by a `nullptr` buffer. That is, by carefully changing the buffer index in the flatbuffer serialization, we can force the TFLite interpreter to consider a read-only tensor to be a read-write one and assume that there is an operator that has this tensor as output, writing to it and allocating memory before the tensor is used as input. If this does not happen, we get memory corruption.

PiperOrigin-RevId: 332524692
Change-Id: I57ef175152a29020af9ab041dc959e5631dce40f
2020-09-19 18:00:34 -07:00
Mihai Maruseac
8c2092e9f9 [tflite] Ensure inputs and outputs don't overlap.
If a model uses the same tensor for both an input and an output then this can result in data loss and memory corruption. This should not happen.

PiperOrigin-RevId: 332522916
Change-Id: If0905b142415a9dfceaf2d181872f2a8fb88f48a
2020-09-18 19:23:05 -07:00
Mihai Maruseac
0b5be2717a [tflite] Make GetOptionalInputTensor the same as GetInput.
With the previous change, there is no longer a need for two separate APIs. We will deprecate `GetOptionalInputTensor` in the future.

PiperOrigin-RevId: 332513386
Change-Id: Id7110271c25ebd6126ad8c82a493e37e0e0756b3
2020-09-18 18:42:20 -07:00
Mihai Maruseac
d8f8236c29 [tflite] Test for kTfLiteOptionalTensor in GetInput.
`GetInput`, `GetVariableInput` and `GetOutput` all fail to check for the case where `node->inputs->data[index]` is the special `kTfLiteOptionalTensor` value (-1), which then causes `context->tensors[node->inputs->data[index]]` to read from an invalid memory location.

This fix makes `GetInput` and related functions return `nullptr` in those cases, asking the caller to check for `nullptr`. This is better than having `GetOptionalInputTensor` and `GetOptionalOutputTensor` (which does not exist but could be added), as using the patched `GetInput` in error would be caught by a sanitizer test in the default optimized build (due to the `-fsanitize=null` option).

PiperOrigin-RevId: 332512190
Change-Id: Iabca54da2f2de02b6ece3c38b54f76d4277d689e
2020-09-18 18:22:41 -07:00
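The patched lookup this commit describes can be sketched in Python (names hypothetical; the real code is the C++ `GetInput` in TFLite): when the stored index is the optional-tensor sentinel, return a null value instead of indexing the tensor array with -1, and let the caller check.

```python
K_TFLITE_OPTIONAL_TENSOR = -1  # sentinel meaning "this input was omitted"

def get_input(tensors, node_input_indices, index):
    """Sketch of the hardened lookup: return None for the sentinel
    instead of reading tensors[-1] (a wrong, but in C++ also unsafe,
    location)."""
    tensor_index = node_input_indices[index]
    if tensor_index == K_TFLITE_OPTIONAL_TENSOR:
        return None
    return tensors[tensor_index]
```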
Mihai Maruseac
015b91f752 [tflite] Don't check for buffers on every subgraph.
Buffers in the model are allocated globally, hence it makes sense to check for
their presence only once (O(1)) instead of on every subgraph (O(n)).

PiperOrigin-RevId: 323677724
Change-Id: I2da0c381093006828cc4c80f03dec8a917782861
2020-09-18 16:28:34 -07:00
Mihai Maruseac
f19bc1bcf5
Merge pull request #40731 from tensorflow/mm-cherry-pick-sqlite-fix-r2.0
Cherry-pick sqlite version bump
2020-09-18 11:16:58 -07:00
Mihai Maruseac
fe9e707e4f Bump sqlite to 3.33.0
This should handle CVE-2020-15358.

PiperOrigin-RevId: 332484006
Change-Id: Id2e7c4e877fcfaa53184fd21139a00f3234a5e3d
2020-09-18 11:13:35 -07:00
Mihai Maruseac
5fd6a2d0a8
Merge pull request #43305 from tensorflow/mihaimaruseac-patch-1
Disable test which fails on mac pip
2020-09-17 10:16:51 -07:00
Mihai Maruseac
d80787c241
Disable test which fails on mac pip 2020-09-17 10:15:59 -07:00
Mihai Maruseac
f3fece3d60
Merge pull request #43283 from tensorflow/mihaimaruseac-patch-1
Fix missing comma
2020-09-16 17:06:07 -07:00
Mihai Maruseac
0703cc53ae
Fix missing comma 2020-09-16 17:05:48 -07:00
Mihai Maruseac
ba7ae97c01
Merge pull request #43281 from tensorflow/patch_release
Fixing build files for broken test/builds
2020-09-16 15:22:15 -07:00
Geeta Chavan
cb10f638c4 Fixing build files for broken test/builds 2020-09-16 15:15:00 -07:00
Mihai Maruseac
3ecef3cbc6
Merge pull request #42536 from tensorflow/update_v_2.0
Pin numpy version to 1.19
2020-08-21 01:00:49 +00:00
Geeta Chavan
0da03ad0c9 Pin numpy version to 1.19 2020-08-20 16:25:47 -07:00
Mihai Maruseac
5e8d4012b4
Merge pull request #41706 from angerson/r2.0
Backport new rel/ structure to r2.0 branch
2020-07-24 17:15:49 +00:00
Austin Anderson
06e7f8aebc Backport new rel/ structure to r2.0 branch
These scripts are from the branch, not cloned from master.
2020-07-24 08:42:03 -07:00
TensorFlower Gardener
67746f8bd9 Merge pull request #40705 from Intel-tensorflow:chuanqiw/upgrade_sqlite
PiperOrigin-RevId: 317934381
Change-Id: I95cdf789f7f5a89d75d45a1b6d67f1ad993cafab
2020-06-23 14:23:05 -07:00
Mihai Maruseac
2c2fdd3205
Merge pull request #39355 from tensorflow-jenkins/relnotes-2.0.2-25649
Update release notes for TensorFlow 2.0.2
2020-05-12 19:46:23 +00:00
Mihai Maruseac
d853307857
Update RELEASE.md 2020-05-12 12:45:52 -07:00
Mihai Maruseac
2cab83adf3
Merge pull request #37063 from tensorflow/mm-cherrypick-36960-cve-fixes-on-r2.0
[Intel Mkl] Upgrade Sqlite3 to fix CVE-2019-19880 CVE-2019-19244 and …
2020-05-10 17:23:04 +00:00
Mihai Maruseac
b64d5ed6ea
Merge pull request #38407 from tensorflow/mm-cherrypick-libjpeg-turbo-on-r2.0
Cherrypick libjpeg-turbo version update
2020-05-10 17:03:40 +00:00
Mihai Maruseac
f09139cab4
Merge pull request #38275 from tensorflow/mm-cherrypick-curl-cve-fix-on-r2.0
Cherry-pick curl CVE fix
2020-05-10 15:44:26 +00:00
Mihai Maruseac
4dc855b72e
Merge pull request #39086 from ShaneSmiskol/r2.0-cherry-pick-pycharm-dynamic-display
Fix dynamic display for PyCharm (r2.0)
2020-05-10 15:17:02 +00:00
Mihai Maruseac
4d1d786c91
Merge pull request #39191 from tensorflow/mm-cherry-pick-apache-switches-on-r2.0
Increase Apache Spark version to 2.4.5 to handle GitHub Security Alert
2020-05-10 13:27:44 +00:00
Mihai Maruseac
e6f531b523
Merge pull request #39356 from tensorflow-jenkins/version-numbers-2.0.2-6911
Update version numbers for TensorFlow 2.0.2
2020-05-09 17:03:20 +00:00
TensorFlow Release Automation
ac3518ed2d Update version numbers to 2.0.2 2020-05-09 10:02:34 -07:00
TensorFlow Release Automation
7fdf7f2565 Insert release notes place-fill 2020-05-09 09:54:42 -07:00
Mihai Maruseac
2ab9864cd9 Increase Apache Spark version to 2.4.5 to handle GitHub Security Alert
Handles CVE-2019-10099, CVE-2018-17190, CVE-2018-11770.

To be cherrypicked on r1.15, r2.0, r2.1 and r2.2 branches

PiperOrigin-RevId: 309955549
Change-Id: I5ee68fdd3270534066487be67232c1abc687f968
2020-05-05 09:14:28 -07:00
Shane Smiskol
30d8248a2f Fix dynamic display for PyCharm 2020-05-01 15:09:15 -05:00
TensorFlower Gardener
0ecb0e0005 Merge pull request #38401 from yongtang:libjpeg-turbo
PiperOrigin-RevId: 305742301
Change-Id: I50968c0868f70a5009f018d3132cba77c0900158
2020-04-09 13:23:39 -07:00
Mihai Maruseac
24ccca49fa Add socketpair.c to curl buildable files to fix Windows builds.
Follow-up from bfb0e49d58

PiperOrigin-RevId: 305351839
Change-Id: Ic7a8b4942394d6d030e93b3ad9179e0bffdc434c
2020-04-07 15:33:55 -07:00
TensorFlower Gardener
cf6831f6da Merge pull request #38200 from Intel-tensorflow:chuanqiw/curl_upgrade
PiperOrigin-RevId: 304938718
Change-Id: I408e3b1d9ce1badfb08666ddac6400bae2c97936
2020-04-06 09:36:33 -07:00
Clayne Robison
b5dd7bf139 [Intel Mkl] Upgrade Sqlite3 to fix CVE-2019-19880 CVE-2019-19244 and CVE-2019-19645 2020-02-25 12:48:58 -08:00
Mihai Maruseac
a641fa1c5c
Merge pull request #36260 from angerson/r2.0
Remove 2.0.x Python 2 docker images
2020-01-27 20:54:22 +00:00
Austin Anderson
791c6faf17 Remove 2.0.x Python 2 docker images 2020-01-27 12:46:35 -08:00
Mihai Maruseac
765ac8d16e
Merge pull request #35913 from tensorflow-jenkins/relnotes-2.0.1-6767
Update release notes for TensorFlow 2.0.1
2020-01-22 23:43:57 +00:00
Mihai Maruseac
0bcb99b375
Add CVE number for main patch 2020-01-21 17:12:22 -08:00
Mihai Maruseac
a093c7ebd4
Merge pull request #36085 from tensorflow/mm-r2.0-fix-release-builds-pt4
Attempt 4 at fixing release builds
2020-01-20 20:52:46 -08:00
Mihai Maruseac
63aedd7d84 Disable test that times out on mac non pip builds 2020-01-20 20:09:15 -08:00
Mihai Maruseac
619c578581 Disable the gpu on cpu tests as they were added for 2.1 2020-01-20 19:58:47 -08:00
Mihai Maruseac
1a617d66fe
Merge pull request #36047 from tensorflow/mm-r2.0-fix-release-builds-pt3
Third attempt at fixing the release builds
2020-01-19 19:03:55 -08:00
Mihai Maruseac
32d9138b2e Cleanup the windows builds 2020-01-19 18:32:35 -08:00
Mihai Maruseac
dd1ebd7542 Cleanup macos builds 2020-01-19 18:32:18 -08:00
Mihai Maruseac
3b9305981c Remove py2 macos scripts 2020-01-19 18:32:02 -08:00
Mihai Maruseac
606596f080 Remove builds which are not needed for the release 2020-01-19 18:31:43 -08:00
Mihai Maruseac
39283d4b86 Pin estimator and tensorboard in common.sh instead of in build files 2020-01-19 17:25:52 -08:00
Mihai Maruseac
fe523f06c7
Merge pull request #36041 from tensorflow/mm-r2.0-fix-release-builds-pt2
Fix release builds pt2
2020-01-19 10:46:57 -08:00
Mihai Maruseac
3d77cdd7dc Remove the tests that depend on tf.contrib 2020-01-19 08:41:47 -08:00
Mihai Maruseac
932e052347 Remove -y on pip install as it does not exist 2020-01-19 08:41:42 -08:00
Mihai Maruseac
3b432d69b0
Merge pull request #36026 from tensorflow/mm-r2.0-patch
Fix segfault when attempting to convert string to float16.
2020-01-19 08:09:28 -08:00
Mihai Maruseac
b8c41d43e6 Fix copy-paste error 2020-01-18 15:21:39 -08:00
Mihai Maruseac
328255abee Fix BUILD file 2020-01-18 14:57:44 -08:00
Mihai Maruseac
bedfb3999f
Update RELEASE.md 2020-01-18 14:52:35 -08:00
Mihai Maruseac
620c7705fb Fix cherry-pick 2020-01-18 14:16:45 -08:00
Mihai Maruseac
3ccda1c38d Changes with no issues 2020-01-18 14:02:00 -08:00
Mihai Maruseac
d2b2ee94f2 Revert "Fix segfault when attempting to convert string to float16."
This reverts commit 7dc97c2704.
2020-01-18 14:01:00 -08:00
Mihai Maruseac
cc01e36c53
Merge pull request #36022 from tensorflow/mm-r2.0-fix-release-builds
Fixes to the release builds, based on first run attempt
2020-01-18 13:44:41 -08:00
Mihai Maruseac
7dc97c2704 Fix segfault when attempting to convert string to float16.
To make sure this gets fixed, add test for converting string to any numeric type.

PiperOrigin-RevId: 286650886
Change-Id: I81f770ec2bbd33a863e8057ce198c679912fa8e0
2020-01-18 13:42:51 -08:00
Mihai Maruseac
89a24acee1 Pin estimator and tensorboard at their version during final release 2020-01-18 12:18:06 -08:00
Mihai Maruseac
5aa3a3b23f Fix linux/gpu_on_cpu build based on changes to master 2020-01-18 11:54:10 -08:00
Mihai Maruseac
b356642918 Remove non-existent build in TF2.0 2020-01-18 10:56:54 -08:00
Mihai Maruseac
c847bdea29 Use xcode 10.3 for pip builds too. See #32998 2020-01-18 10:51:39 -08:00
Mihai Maruseac
719bd67640 Update xcode to 10.3 2020-01-18 10:39:38 -08:00
Mihai Maruseac
c31671d519
Merge pull request #36012 from tensorflow/mm-r2.0-deps-upgrade
Upgrade dependencies
2020-01-18 07:22:47 -08:00
Mihai Maruseac
0645aeb94e Add missing quic.h header file 2020-01-17 21:42:55 -08:00
Mihai Maruseac
bf94600681 Update curl 2020-01-17 17:04:29 -08:00
Mihai Maruseac
06865f81f7 Update URL to giflib to https 2020-01-17 16:52:09 -08:00
Mihai Maruseac
2774d80642 Update sqlite amalgamation 2020-01-17 16:51:59 -08:00
Mihai Maruseac
340387d8b4
Merge pull request #34995 from sisp/pep508-environment-markers-r2.0
Use PEP508 environment markers (r2.0)
2020-01-16 07:03:52 -08:00
Mihai Maruseac
0ee3001040
Merge pull request #35916 from tensorflow-jenkins/version-numbers-2.0.1-8555
Update version numbers for TensorFlow 2.0.1
2020-01-16 07:03:30 -08:00
TensorFlow Release Automation
86d38ef153 Update version numbers to 2.0.1 2020-01-15 14:17:37 -08:00
TensorFlow Release Automation
809fa9f8d4 Insert release notes place-fill 2020-01-15 13:59:20 -08:00
Mihai Maruseac
b65ea7395a
Merge pull request #33982 from tensorflow/cherrypick-release-build-2pt0
Cherrypick release builds.
2020-01-15 10:44:18 -08:00
Mihai Maruseac
f4b25218cd --incompatible_list_based_execution_strategy_selection is not a startup option 2020-01-14 08:51:43 -08:00
Mihai Maruseac
758b71bce6 Fix sanity reporting 2020-01-14 08:26:58 -08:00
Mihai Maruseac
918f82abbb Fix buildifier (2/2) 2020-01-14 07:59:31 -08:00
Mihai Maruseac
cd6a90f1c0 Fix buildifier 2020-01-14 07:49:11 -08:00
Mihai Maruseac
37658d220f Disable //tensorflow/python/eager:backprop_test on mac 2020-01-10 13:37:41 -08:00
Mihai Maruseac
33238693f0 Add Python 3.7 testing on macOS as we drop support for Python 2.
To be cherry-picked on `r1.15`, `r2.0`, and `r2.1` branches.

PiperOrigin-RevId: 287871757
Change-Id: Ic530e884de421a39a82c686f1f0d086b6400d75c
2020-01-03 08:25:40 -08:00
A. Unique TensorFlower
01d790fa09 Reduced tolerance of ExponentialOpTest.
PiperOrigin-RevId: 281156604
Change-Id: I57fae6b19444a5e4ccf4a731ffe6722269fda4c4
2019-12-26 14:26:10 -08:00
Mihai Maruseac
0d7a1b96e0 Proper bazel flag usage for android build 2019-12-26 13:13:42 -08:00
Mihai Maruseac
b26dddd05f Add --incompatible_list_based_execution_strategy_selection to fix more presubmits 2019-12-26 11:15:10 -08:00
Mihai Maruseac
fa4a5f2c1b Fix android build error due to spacing issues.
BUILD rules should start on the first column in the file, not indented.

PiperOrigin-RevId: 286949644
Change-Id: I4e1b11d7e8889d63193725270ccfb1bac522e15d
2019-12-26 08:07:41 -08:00
Mihai Maruseac
74f7f99cd1 Add open source build scripts for the android presubmit.
PiperOrigin-RevId: 286931649
Change-Id: I68c08a14cf12f0d33b2ddeb182751128ab8dac6e
2019-12-23 12:59:51 -08:00
Mihai Maruseac
e755c3428f Add open source build scripts for the macos presubmit.
PiperOrigin-RevId: 286931214
Change-Id: I0a85a1e92835aad78acab729e07795ca103dce8c
2019-12-23 12:59:46 -08:00
Mihai Maruseac
efa1d86ec7 Add open source build scripts for the windows presubmits.
PiperOrigin-RevId: 286931071
Change-Id: If6ef44a6a677f9ecfbf40a00173b46684b926da9
2019-12-23 12:59:37 -08:00
Mihai Maruseac
6fa9a5f415 Add open source build scripts for the ubuntu presubmits.
PiperOrigin-RevId: 286911782
Change-Id: I599073962631f632fae4d6e79c7347982428cecb
2019-12-23 10:33:11 -08:00
Mihai Maruseac
af1ffe7c9c Publish sanity presubmit script.
PiperOrigin-RevId: 286669248
Change-Id: I687e242e69784a804e477fce909b9a091c8f43ad
2019-12-20 18:29:38 -08:00
Mihai Maruseac
8da3a7ba71 Add RBE to .bazelrc 2019-12-20 14:43:53 -08:00
Sigurd Spieckermann
c739b7f878 Use PEP508 environment markers 2019-12-10 11:31:34 +01:00
Hye Soo Yang
4726095fc7 Reverting bazel changes. 2019-11-07 09:54:47 -08:00
A. Unique TensorFlower
bcab47312f Upgrade Bazel to 0.29.1
Besides upgrading the Bazel version, we also refactored all build scripts to use rbe options in .bazelrc file.
In order to migrate for https://github.com/bazelbuild/bazel/issues/7480, we have to specify the complete strategies list in .bazelrc file.

PiperOrigin-RevId: 275459466
Change-Id: Iaec997da7862245955a36ebb1018d901f61c591d
2019-11-07 09:37:07 -08:00
A. Unique TensorFlower
9acc00baac Update bazel in the build scripts.
PiperOrigin-RevId: 278921759
Change-Id: Ib25e275eb64478cea9ad9d4e3298e195a0920aac
2019-11-06 16:06:49 -08:00
Hye Soo Yang
3d7a9c895a Downgrade expected bazel version. 2019-11-04 10:26:25 -08:00
Hye Soo Yang
e630ac31af Cherrypick release builds. 2019-11-04 09:56:29 -08:00
Goldie Gadde
1cf0898dd4
Merge pull request #32474 from ROCmSoftwarePlatform/r2.0-rocm-upstream-squashed
[ROCM] Patch to enable rocm for r2.0 release branch
2019-10-08 09:40:53 -07:00
sunway513
c4604a3265 Update python/keras/optimizer_v2/BUILD to pass buildifier test 2019-10-05 13:40:57 -05:00
sunway513
3f80be7894 Fix conv_ops_3d and pooling_ops_test scripts 2019-10-03 15:39:18 -05:00
Goldie Gadde
64c3d382ca Update RELEASE.md 2019-09-27 14:56:33 -07:00
Goldie Gadde
2845767d91 Update RELEASE.md 2019-09-27 14:56:33 -07:00
Nathan Luehr
3d230aaa1f Update release notes for tensorrt and mixed precision 2019-09-27 14:56:33 -07:00
Goldie Gadde
b1c53619cf Update RELEASE.md 2019-09-27 14:56:33 -07:00
Goldie Gadde
51054374ea Update RELEASE.md 2019-09-27 14:56:33 -07:00
Goldie Gadde
cf6180b841 Update RELEASE.md 2019-09-27 14:56:33 -07:00
Goldie Gadde
ec8d660892 Release Notes for 2.0.0-rc0 2019-09-27 14:56:33 -07:00
Goldie Gadde
ac24e9eb3a
Merge pull request #32861 from guptapriya/cherrypicks_5NZHH
Mark tf.keras.utils.multi_gpu_model as deprecated.
2019-09-27 08:35:23 -07:00
Priya Gupta
23a94133f5 Mark tf.keras.utils.multi_gpu_model as deprecated.
PiperOrigin-RevId: 271495434
2019-09-26 22:16:49 -07:00
Goldie Gadde
1f372a0968
Merge pull request #32742 from rmlarsen/cherrypicks_BX1WK
[r2.0-CherryPick]: [Grappler] Fix bug in layout optimizer introduced by cl/247704284. Re…
2019-09-26 18:46:21 -07:00
Goldie Gadde
5f13570706
Merge pull request #32852 from gunan/r2.0
[r2.0-CherryPick]: Fix licenses in C, java and python packages.
2019-09-26 18:45:58 -07:00
Gunhan Gulsoy
0a492b89f4 Include the license of TensorFlow in the pip package.
PiperOrigin-RevId: 271441009
2019-09-26 16:09:44 -07:00
Gunhan Gulsoy
03156c145d Make sure TF license is included in C/Java
TF package license was overwritten with a file with the same name.

Now, in C, the licenses are at the root as:
LICENSE: TensorFlow license file
THIRD_PARTY_TF_C_LICENSES: licenses for all TF dependencies.

Java:
LICENSE: TensorFlow license file
THIRD_PARTY_TF_JNI_LICENSES: licenses for all TF dependencies.
PiperOrigin-RevId: 271376706
2019-09-26 16:09:41 -07:00
Goldie Gadde
a2766d279f
Merge pull request #32823 from pooyadavoodi/enable_trt6
[r2.0-CherryPick]:Update TRT6 headerfiles
2019-09-25 21:21:25 -07:00
Goldie Gadde
99655ee92e
Merge pull request #32824 from tensorflow/ggadde-2-0-final-version
[r2.0-CherryPick]: Update TF version to 2.0.0.
2019-09-25 18:57:18 -07:00
Goldie Gadde
5d5bca28e7
Merge pull request #32826 from tensorflow/cherrypick_fc_v2
[r2.0-CherryPick]: Fix deprecation warnings in feature_column_v2.
2019-09-25 18:56:54 -07:00
Yanhui Liang
30af3984ff Fix deprecation warnings in feature_column_v2.
PiperOrigin-RevId: 271211187
2019-09-25 16:13:47 -07:00
Goldie Gadde
a4180fc5c6 Update tf version to 2.0.0. 2019-09-25 15:37:43 -07:00
Pooya Davoodi
a88fef676c Add new TensorRT6 functions to SimpleITensor 2019-09-25 14:53:20 -07:00
Pooya Davoodi
6ce88bc983 Update TRT6 headerfiles 2019-09-25 14:38:27 -07:00
Goldie Gadde
bb52379685
Merge pull request #32820 from tanzhenyu/cherrypicks_KK9GH
[r2.0-CherryPick]:remove array_ops.where deprecation message.
2019-09-25 13:52:44 -07:00
Zhenyu Tan
550d3a428d remove array_ops.where deprecation message.
PiperOrigin-RevId: 271168043
2019-09-25 13:03:48 -07:00
Amit Patankar
75a91fac16 Fixing presubmits for 2.0 by removing contrib as a testing target. 2019-09-24 09:43:33 -07:00
Amit Patankar
2e914feef3 Adding a default target set for presubmits.
PiperOrigin-RevId: 270791565
2019-09-24 09:43:33 -07:00
Mihai Maruseac
3aded2100b Pin Estimator version to newly released 2.0.0 2019-09-24 09:31:06 -07:00
A. Unique TensorFlower
0e2545be93 [Grappler] Fix bug in layout optimizer introduced by cl/247704284. Restore the original graph if Tune() fails.
PiperOrigin-RevId: 270737190
2019-09-23 14:02:33 -07:00
Goldie Gadde
2646d23074
Merge pull request #32669 from tensorflow/ggadde-cp-19
[r2.0-CherryPick]:[tf.data] Avoid double conversion to a tensor during input normalizat…
2019-09-19 14:51:25 -07:00
Goldie Gadde
3b88a7d990
Merge pull request #32644 from tensorflow/nfelt-tb-2.0-cherrypick
TF 2.0 Cherrypick for TensorBoard 2.0 dependency
2019-09-19 14:50:49 -07:00
Goldie Gadde
a54776c1a7
Merge pull request #32631 from kkimdev/cherrypicks_E6E8H
[r2.0-CherryPick]:forward_compatible env variable caching perf optimization.
2019-09-19 14:46:26 -07:00
Jiri Simsa
4f07c49468 [tf.data] Avoid double conversion to a tensor during input normalization.
PiperOrigin-RevId: 270046393
2019-09-19 12:31:36 -07:00
Nick Felt
42be2d86b6
TF 2.0 Cherrypick for TensorBoard 2.0 dependency
Update tensorboard dependency to 2.0.x - TensorBoard release: https://pypi.org/project/tensorboard/2.0.0/
2019-09-18 22:57:32 -07:00
Goldie Gadde
41474120da
Merge pull request #32626 from tensorflow/r2.0-cherry-picks-for-estimator
Cherrypicks for estimator 2.0 release
2019-09-18 15:17:59 -07:00
Kibeom Kim
ed5e2b1506 forward_compatible env variable caching perf optimization.
PiperOrigin-RevId: 269832823
2019-09-18 12:38:15 -07:00
Zhenyu Tan
ef2bf71478 Add serialization/deserialization for WideDeep model.
PiperOrigin-RevId: 263859331
2019-09-18 10:02:29 -07:00
Zhenyu Tan
b7ee3a8ead Add serial/deserial for Linear Model and test in model_to_estimator.
PiperOrigin-RevId: 263647067
2019-09-18 10:02:21 -07:00
Goldie Gadde
7226dfb2d7
Merge pull request #32441 from tensorflow/cherrypick_batch_dot
[r2.0-CherryPick]:Cherrypick batch_dot behavior change
2019-09-17 14:39:39 -07:00
Goldie Gadde
f136f5467b
Merge pull request #32565 from kkimdev/cherrypicks_I3HYM
[r2.0-CherryPick]: Autograph: Remove tf.autograph.experimental.set_loop_options doc
2019-09-17 14:32:22 -07:00
Goldie Gadde
1d908742b2
Merge pull request #32596 from tomerk/cherrypicks_YOUHM
Fix support for custom op namespacing in Python
2019-09-17 14:31:45 -07:00
A. Unique TensorFlower
95e3c9cade Yet another gen_file fix to support '>' namespace separators in op names. (In this case when an op has multiple named outputs)
PiperOrigin-RevId: 269455111
2019-09-17 11:14:32 -07:00
A. Unique TensorFlower
acb27f3f05 (Hopefully) remaining work to get custom ops working w/ namespaces. Also updates some example tests about adding ops so they create & run namespaced ops.
PiperOrigin-RevId: 268944869
2019-09-17 11:03:39 -07:00
A. Unique TensorFlower
b35ee613eb Fixes a bug in the custom op namespacing support where node_def 'name' was made to support '>' for namespaces instead of op_def name
PiperOrigin-RevId: 268570785
2019-09-17 11:03:38 -07:00
Mihai Maruseac
1c83755e5a
Merge pull request #32560 from akshaym/cherrypicks_QGUWI
Stop caching inf/nan floats
2019-09-16 15:20:28 -07:00
Kibeom Kim
cdbf2d7119 Autograph: Remove tf.autograph.experimental.set_loop_options doc
It's not implemented yet

PiperOrigin-RevId: 269387821
2019-09-16 14:58:15 -07:00
A. Unique TensorFlower
d109b8812a Split segment_reduction_ops.cc to reduce compile time.
PiperOrigin-RevId: 266052197
2019-09-16 12:38:16 -07:00
Peng Sun
a70de4d877 add 'no_rocm' tag to //tensorflow/python/keras/optimizer_v2:optimizer_v2_test_gpu
A recent change broke this test for the ROCm platform.
We are looking into fixing it, but need to disable it in the meantime
because it runs as part of the ROCm Community Supported Build.
2019-09-16 14:38:06 -05:00
sunway513
2118059a0f Revert the reduction ops changes for NV path 2019-09-16 14:34:05 -05:00
sunway513
41041a9c79 Revert "improve concurrency between compute and nccl streams"
This reverts commit 7dbb5dd1c4.
2019-09-16 14:34:05 -05:00
Peng Sun
9cebda7419 Remove the non-applicable patches for conv_grad_filter_ops and conv_grad_input_ops 2019-09-16 14:34:04 -05:00
Peng Sun
1c95052fac Fix format issues reported by pylint 2019-09-16 14:34:04 -05:00
Peng Sun
4da1ccb9ff Patch to enable r2.0 ROCm non-xla support
The goal of this PR is to patch the TensorFlow r2.0 release so it fully enables the ROCm
non-XLA path.

Most of the PRs cherry-picked in this patch have already been merged into the
upstream master branch.

The following are all the related commits that were cherry-picked:

Commits on Aug 20, 2019
deven-amd and sunway513
adding/updating ROCm support in the ci_build scripts
d5a0eee
deven-amd and sunway513
updating Dockerfile.rocm to pick a specific version of the rocm libra… …
e335575
deven-amd and sunway513
adding a script for testing the ROCm Community Supported Build
ae83a20

Commits on Aug 22, 2019
deven-amd and sunway513
Resolve merge conflicts for PR #31393
73ff708
deven-amd and sunway513
The following PR/commit breaks the --config=rocm build …
614bdb5
deven-amd and sunway513
updating testcases to work correctly with ROCm
1685240
jeffdaily and sunway513
improve concurrency between compute and nccl streams …
3fbb049
whchung and sunway513
[ROCm] enable roll op on ROCm.
1d5f440
whchung and sunway513
[ROCm] enable InTopK op on ROCm.
941f713
deven-amd and sunway513
updating README.md with information on ROCm Community Supported Builds
73ce64e

Commits on Aug 25, 2019
houtoms and sunway513
fixed potential rocm breaks from use_padded_io
0832b33
deven-amd and sunway513
adding no_rocm tag on unit-tests that check features that are current… …
7aed626
deven-amd and sunway513
Adding ROCm support for reduction ops
82bd216
sunway513
Fix ROCm path build error in rocm_dnn.h
5dba305

Commits on Aug 27, 2019
deven-amd
fixing test failures by skipping parts that functionality not yet sup… …
be6378c
sunway513
Merge pull request #616 from ROCmSoftwarePlatform/r2.0-rocm-upstream-… …
d98a943
sunway513
Add no_rocm tag to //tensorflow/python:stateful_random_ops_test_gpu
d05a47f

Commits on Sep 04, 2019
sunway513
Merge branch 'r2.0-rocm-upstream' of https://github.com/ROCmSoftwareP… …
b1148e4

Commits on Sep 06, 2019
deven-amd and sunway513
adding ROCm support in the build_pip_package script
b908324
2019-09-16 14:34:04 -05:00
Akshay Modi
8996e40f93 Use std::isfinite instead of Py_IS_FINITE
PiperOrigin-RevId: 269010942
2019-09-16 12:23:10 -07:00
Akshay Modi
8a385c775d Don't cache inf/nan floats
PiperOrigin-RevId: 268941838
2019-09-16 12:23:06 -07:00
Goldie Gadde
4096702326
Merge pull request #32399 from jaingaurav/cherry-2.0
[r2.0 CherryPick]: Use experimental_ref() in moving_averages
2019-09-15 11:35:15 -07:00
Goldie Gadde
a4b5755b30
Merge pull request #32511 from k-w-w/cherrypicks_9FILR
[r2.0-CherryPick]:Fix bug when cloning functional models that use Tensor keyword argume…
2019-09-15 11:34:41 -07:00
Mihai Maruseac
017abe402d
Merge pull request #32538 from perfinion/r2.0-patches
Systemlibs cherry-picks for r2.0
2019-09-15 10:34:28 -07:00
Goldie Gadde
80147fb509 Update the version to 2.0.0-rc2 2019-09-14 17:18:36 -07:00
Goldie Gadde
451681fa38
Merge pull request #32527 from jaingaurav/cherry-2.0-2
[r2.0-CherryPick]:Disallow comparing ObjectIdentityWrapper to others
2019-09-14 14:56:36 -07:00
Gaurav Jain
97b6a54c41 Disallow comparing ObjectIdentityWrapper to others
When using the experimental_ref() API on Tensors & Variables, a common
bug I hit was incorrectly comparing a wrapped object with an unwrapped
object instead of first calling deref(). To avoid this we now raise an
exception instead of returning False. This implies that if Tensors
and Variables are kept in the same set or dictionary as other objects,
an exception can be raised if there is a hash collision.

PiperOrigin-RevId: 268837575
(cherry picked from commit 57e8769bc4)
2019-09-14 10:13:58 -07:00
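The identity-comparison pitfall described above can be illustrated with a minimal stand-alone sketch (a hypothetical simplification, not TensorFlow's actual `ObjectIdentityWrapper` implementation):

```python
class IdentityWrapper:
    """Toy stand-in for an identity-based wrapper (hypothetical, simplified)."""

    def __init__(self, wrapped):
        self._wrapped = wrapped

    def deref(self):
        # Recover the original object.
        return self._wrapped

    def __hash__(self):
        # Hash by identity, not by value.
        return id(self._wrapped)

    def __eq__(self, other):
        if not isinstance(other, IdentityWrapper):
            # Comparing a wrapped object with an unwrapped one is almost
            # always a bug (the caller forgot deref()), so raise instead of
            # silently returning False.
            raise TypeError("Cannot compare wrapped and unwrapped objects; "
                            "call deref() first.")
        return self._wrapped is other._wrapped


x = object()
w = IdentityWrapper(x)
assert w == IdentityWrapper(x)   # same underlying object: equal
assert w.deref() is x            # deref() returns the original
```

As the commit notes, raising on the mixed comparison also means hash collisions in mixed sets/dicts can surface as exceptions rather than silent `False` results.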
Gaurav Jain
d159647866 Use experimental_ref() in moving_averages
In addition, fix zero_debias_true to use experimental_ref.

PiperOrigin-RevId: 268230742
(cherry picked from commit a0e85fd379)
(cherry picked from commit c069581e47eec6ff05156e81898ce3e7bc350401)
2019-09-14 10:03:19 -07:00
Jason Zaman
cd5e6f3f36 systemlibs: unbundle functools32
Signed-off-by: Jason Zaman <jason@perfinion.com>
2019-09-14 17:44:14 +08:00
Jason Zaman
b60cc3a082 systemlibs: jsoncpp: update header symlinks for jsoncpp 1.9
Signed-off-by: Jason Zaman <jason@perfinion.com>
2019-09-14 17:41:47 +08:00
Jason Zaman
cc72c6f4dc systemlibs: unbundle enum34
Signed-off-by: Jason Zaman <jason@perfinion.com>
2019-09-14 17:41:47 +08:00
Jason Zaman
93c797678c install_headers: fix paths of generated headers
The generated headers moved from bazel-genfiles to bazel-bin, so change
the match to remove both. Also adjust the external library header files
so they have the right paths to work with the base include.

All the TensorFlow header files should compile cleanly on their own
(excluding the windows ones etc). To verify, run the target then install
to /usr/include/tensorflow and run the following:

for i in $(find /usr/include/tensorflow -iname "*.h"); do \
g++ -o/dev/null -E -I/usr/include/tensorflow -I/opt/cuda/include $i \
|| echo $i; done

Signed-off-by: Jason Zaman <jason@perfinion.com>
2019-09-14 17:41:47 +08:00
Jason Zaman
1e685a716f pkgconfig: generate tensorflow_cc pkg-config entry
Signed-off-by: Jason Zaman <jason@perfinion.com>
2019-09-14 17:41:47 +08:00
Penporn Koanantakool
396daca0e0 Reorder Bazel rule load order (sort alphabetically). 2019-09-13 23:40:38 -07:00
srinivasan.narayanamoorthy
fa24f39434 Hardest single line change ever. 2019-09-13 23:40:38 -07:00
srinivasan.narayanamoorthy
f3912e8e58 minor fix 2019-09-13 23:40:38 -07:00
srinivasan.narayanamoorthy
4d992c74eb minor formatting fix. 2019-09-13 23:40:38 -07:00
srinivasan.narayanamoorthy
25b3792547 minor fix. 2019-09-13 23:40:38 -07:00
srinivasan.narayanamoorthy
7c3a1cca86 Fixing spurious omp thread spawning. 2019-09-13 23:40:38 -07:00
Katherine Wu
2f43a9e48c Fix bug when cloning functional models that use Tensor keyword arguments.
Aligned the cloning implementation to be similar to Model.from_config(model.get_config()), with a few minor differences.

PiperOrigin-RevId: 268969706
2019-09-13 15:00:36 -07:00
Mihai Maruseac
bf7d835d03
Merge pull request #32507 from annarev/cherrypicks_T5PFH
2.0.0-rc2 cherry-pick request: Use relative imports only in TensorFlow.
2019-09-13 12:32:41 -07:00
Anna R
11cab5dc63 Use relative imports only in TensorFlow. Estimator's autocomplete works without
relative imports.

PiperOrigin-RevId: 268940330
2019-09-13 11:35:48 -07:00
Goldie Gadde
25d8dda8bb
Merge pull request #32482 from reedwm/cherrypick_lso_saving
[r2.0-CherryPick]: Have LossScaleOptimizer better emulate the OptimizerV2 interface.
2019-09-13 10:59:43 -07:00
Goldie Gadde
c69c2b9df2
Merge pull request #32504 from tensorflow/ggadde-cp17
Fix performance regression issue by reusing metrics property.
2019-09-13 10:58:11 -07:00
Pavithra Vijay
6c70a16656 Fix performance regression issue by reusing metrics property.
PiperOrigin-RevId: 268827158
2019-09-13 10:04:24 -07:00
Goldie Gadde
426862d01a
Merge pull request #32440 from tensorflow/cherrypick_cast_to_floatx
[r2.0 Cherrypick]: Cherrypick `cast_to_floatx` new behavior
2019-09-13 08:55:33 -07:00
Reed Wanderman-Milne
e95b85ac57 Have LossScaleOptimizer better emulate the OptimizerV2 interface.
This allows LossScaleOptimizers to be saved in a SavedModel with Model.save().

The main addition is implementing LossScaleOptimizer.get_config(). This required adding serialization support to LossScales.

Also rename the "opt" argument in LossScaleOptimizer.__init__ to "optimizer".

PiperOrigin-RevId: 268776796
2019-09-12 15:23:36 -07:00
Mihai Maruseac
c6babdd8aa
Merge pull request #27546 from angersson/cherrypick-87ea41d023a985c5ecee0370c7f7381c8a8cee52
Fix configure.py to properly compare "X.Y.Z" with "X.Y"
2019-09-12 08:40:58 -07:00
ag.ramesh
c31017a68f Use tf_opts when compiling MkL related eager files. 2019-09-12 01:40:38 -07:00
François Chollet
1e57145558 Sync behavior of batch_dot with external Keras 2019-09-11 12:34:42 -07:00
François Chollet
7853c50239 Add tensor/variable support to cast_to_floatx 2019-09-11 12:29:53 -07:00
tanzhenyu
f03fe1bf79
Merge pull request #32433 from tanzhenyu/cherrypicks_WIQA6
Fix major Adamax gpu bug.
2019-09-11 10:43:51 -07:00
Zhenyu Tan
67c17f221a Fix major Adamax gpu bug.
PiperOrigin-RevId: 268469299
2019-09-11 10:26:10 -07:00
tanzhenyu
ffb00d7fe6
Merge pull request #32431 from tanzhenyu/cherrypicks_L1RT5
[r2.0-CherryPick]:Add Ftrl cuda kernels.
2019-09-11 10:19:26 -07:00
Zhenyu Tan
f117d547a1 Add Ftrl cuda kernels.
PiperOrigin-RevId: 268335477
2019-09-11 09:23:09 -07:00
Mihai Maruseac
59bf3351a4
Merge pull request #32308 from saxenasaurabh/cherrypicks_PE4RX
Fix shape inference for default gradients of resources
2019-09-10 14:06:37 -07:00
Mihai Maruseac
ca3ad1a527
Merge pull request #32179 from tensorflow/jvishnuvardhan-patch-3
Corrected typo in `ones_like`
2019-09-10 13:49:17 -07:00
Mihai Maruseac
355c22c6d9
Merge pull request #32391 from tensorflow/mm-cherrypick-gast-pin-on-2.0
Freeze gast==0.2.2 (#32319) in setup.py requirements.
2019-09-10 13:27:46 -07:00
Mihai Maruseac
16cad8c1f6 Freeze gast==0.2.2 (#32319) in setup.py requirements.
This fixes breakage caused by a minor gast update that broke TensorFlow.

PiperOrigin-RevId: 268261146
2019-09-10 12:40:26 -07:00
Goldie Gadde
d2d2566eef
Merge pull request #32367 from aaroey/r2.0
[r2.0-CherryPick]: Fix segfault error in trt_convert_test
2019-09-09 15:56:00 -07:00
Guangda Lai
26095fd3b1 Fix segfault error in trt_convert_test 2019-09-09 13:55:03 -07:00
Subin M
208e4e4fcd Adding full name
2019-09-09 09:46:28 -07:00
Goldie Gadde
12985ec6f7
Merge pull request #32339 from tensorflow/ggadde-cp16
[r2.0Cherrypick]: Install gast at known version and not the latest.
2019-09-08 22:06:07 -07:00
A. Unique TensorFlower
32f4c71f1d Install gast at known version and not the latest.
PiperOrigin-RevId: 267873409
2019-09-08 21:20:31 -07:00
Goldie Gadde
2e19c76352
Merge pull request #32305 from annarev/cherrypicks_JN7Y8
2.0.0-rc1 cherry-pick request: Import submodules using relative imports.
2019-09-08 16:42:16 -07:00
Anna R
77f5617249 Disable tf.summary test if tensorboard is not installed 2019-09-08 15:34:44 -07:00
Goldie Gadde
f896d2dc8f
Merge pull request #32324 from tensorflow/ggadde-cp-version
[r2.0 CherryPick]:Update version to 2.0.0-rc1
2019-09-08 15:08:46 -07:00
Goldie Gadde
e84e7a5f41
Merge pull request #32334 from annarev/cherrypicks_8035R
2.0.0-rc1 cherry-pick request: Don't exclude tensorboard symbols from goldens if tensorboard pip is installed.
2019-09-08 14:18:54 -07:00
Anna R
3729551d4b Don't exclude tensorboard symbols from goldens if tensorboard pip is installed.
PiperOrigin-RevId: 267883131
2019-09-08 13:25:58 -07:00
Goldie Gadde
2d397d9fc5
Merge pull request #32303 from tensorflow/ggadde-cp15
[r2.0-CherryPick]:Add tf.saved_model.Asset public symbol.
2019-09-08 11:34:59 -07:00
André Susano Pinto
40047b6bf1 Fix SavedModel text embedding example to use TF-2 public APIs.
PiperOrigin-RevId: 267392885
2019-09-08 09:58:54 -07:00
Goldie Gadde
803f868824 Update version to 2.0.0-rc1 2019-09-07 22:01:34 -07:00
Saurabh Saxena
0c47dd5b13 Fix shape inference for default gradients of resources which are usually of the form:
`tf.zeros(gen_resource_variable_ops.variable_shape(resource), dtype)`
This adds support for VariableShape to ShapeRefiner::ConstantPartialShape.
This change should provide better graph building shape inference but should not affect graph-building/runtime behavior.

PiperOrigin-RevId: 267681703
2019-09-06 23:09:55 -04:00
Goldie Gadde
dc363393f6
Merge pull request #32306 from k-w-w/cherrypicks_UR7H3
[r2.0-CherryPick]:Keras SavedModel important bug fixes and refactor
2019-09-06 18:52:21 -07:00
Goldie Gadde
592d128e50
Merge pull request #32301 from tomerk/cherrypicks_6GCFP
[r2.0-CherryPick]: Adds support for generator inputs w/ varying batch sizes & shapes n t…
2019-09-06 18:51:10 -07:00
Katherine Wu
f9494f24d8
Remove dedupe_weights @property
Accidentally left it in the commit.
2019-09-06 17:06:58 -07:00
Katherine Wu
960c3ba8e9 If layer's call function is wrapped in tf.function, then trace the original method instead of the wrapped function.
This avoids the issue where canonicalize_function_inputs added additional defaults to the `args` list.

PiperOrigin-RevId: 267650429
2019-09-06 16:51:01 -07:00
Katherine Wu
cc9422d373 Convert tensors to constants when saving Keras functional and graph models to SavedModel.
This includes changes to backend.get_value() to ensure that the correct graph context is created when getting the value of a tensor.

PiperOrigin-RevId: 267498198
2019-09-06 16:50:59 -07:00
Katherine Wu
91658fade1 Remove SavedModel-specific methods from base Layer and Model, and add documentation for updating the serialization implementation.
PiperOrigin-RevId: 266978650
2019-09-06 16:50:53 -07:00
Anna R
23c3928d31 Automated rollback of commit 1d1f7dfcbd
PiperOrigin-RevId: 266040561
2019-09-06 16:40:38 -07:00
Goldie Gadde
aed84aeded
Merge pull request #32300 from tensorflow/tomerk-patch-1
[r2.0-CherryPick]:Implementing RFC#126: Allow Op names of the form RepoName>OpName
2019-09-06 16:24:39 -07:00
André Susano Pinto
f185080c09 Add tf.saved_model.Asset public symbol.
`Asset` is the mechanism that makes it possible to build hermetic SavedModels
that depend on files. It replaces functionality that in TF 1.x was
typically provided by the ASSET_FILEPATHS collection.

PiperOrigin-RevId: 267534289
2019-09-06 16:17:56 -07:00
Goldie Gadde
14f20c9ff6
Merge pull request #32259 from geetachavan1/cherrypicks_YQPDR
[r2.0 Cherrypick]:Implementing RFC#126: Allow Op names of the form RepoName>OpName.
2019-09-06 16:02:50 -07:00
Goldie Gadde
fd99c2cf93
Merge pull request #32184 from annarev/cherrypicks_32WQL
[2.0.0-rc0 CherryPick]: Only add ModuleWrapper when lazy loading is requested or when using TF 1.x (to print deprecation warnings).
2019-09-06 16:01:54 -07:00
A. Unique TensorFlower
735179ad00 Adds support for generator inputs w/ varying batch sizes & shapes in the Keras unified execution path
PiperOrigin-RevId: 267686850
2019-09-06 15:51:55 -07:00
Tomer Kaftan
2e35d26a48
Implementing RFC#126: Allow Op names of the form RepoName>OpName
This change maps '>' in the op names to underscores in the generated python op function names and export names.

Getting '.' in the names instead is theoretically doable but would add too much complexity to the whole codegen loop to be worthwhile.
(It would require analyzing the op names to group ops by their respective nested namespaces, code-gen'ing nested python classes that match the namespaces, then indenting the codegen'd python ops inside of the nested classes w/ the names set correctly).
2019-09-06 15:09:31 -07:00
Anna R
0733efc770 Fixing builds after removing module wrapper in 2.0. 2019-09-06 14:30:44 -07:00
Goldie Gadde
b6e4c89267
Merge pull request #32297 from penpornk/cherrypicks_IF3LM
[r2.0 CherryPick]: Upgrading giflib to fix CVE-2019-15133
2019-09-06 14:18:20 -07:00
Goldie Gadde
d2e5f5a49e
Merge pull request #32272 from reedwm/docstring_cherrypicks
[r2.0-rc1 CherryPick]: Improve Layer docstrings in regards to autocasting.
2019-09-06 14:17:49 -07:00
Goldie Gadde
8ea0a418a6
Merge pull request #32269 from reedwm/mp_cherrypicks
[r2.0-rc1 CherryPick]: Several tf.keras mixed precision API changes
2019-09-06 14:15:19 -07:00
Goldie Gadde
5a580681ad
Merge pull request #32257 from omalleyt12/cherrypicks_20IEW
[r2.0-CherryPick] Deduplicate Keras weights
2019-09-06 14:14:22 -07:00
Goldie Gadde
687a4dbbaf
Merge pull request #32267 from penpornk/cherrypicks_AV4NQ
[r2.0 CherryPick]: [INTEL MKL] Add support for Addv2
2019-09-06 13:13:52 -07:00
Goldie Gadde
31d4ac9744
Merge pull request #32292 from tanzhenyu/cherrypicks_6APVN
[r2.0-CherryPick]:Fix loss computation when y_true and y_pred are not the same shape.
2019-09-06 12:58:51 -07:00
Goldie Gadde
a14d091723
Merge pull request #32065 from k-w-w/cherrypicks_RSDJN
Add SaveOptions object with option to whitelist op namespaces.
2019-09-06 12:58:30 -07:00
Penporn Koanantakool
a8d963daa2 Rollforward of PR #32169: Upgrading giflib to fix CVE-2019-15133
Add a patch file to fix giflib's compilation issue on Windows (replace a call to strtok_r with strtok_s).

**NVD**: 2019/08/17 - CVSS v2.0 Base Score: 4.3 - CVSS v3.0 Base Score: 6.5
In GIFLIB before 2019-02-16, a malformed GIF file triggers a divide-by-zero exception in the decoder function DGifSlurp in dgif_lib.c if the height field of the ImageSize data structure is equal to zero.

Source | Link | Type
---- | ---- | ----
MISC | bugs.chromium.org | Mailing List, Third Party Advisory
UBUNTU | usn.ubuntu.com | Third Party Advisory

PiperOrigin-RevId: 267533902
2019-09-06 12:42:44 -07:00
Alexandre Passos
4af47547fe
Merge pull request #32282 from jaingaurav/cherry-2.0
Add incompatible_shape_error attribute to equal op
2019-09-06 11:19:06 -07:00
Goldie Gadde
bb85212ce9
Merge pull request #32252 from jdduke/cherrypicks_C49XH
[r2.0-Cherrypick]NNAPI TransposeConv op takes tensor inputs from TFLite node
2019-09-06 09:35:53 -07:00
Gaurav Jain
ff750af214 Add fill_functor.h to cwise_lib to fix ROCm build
PiperOrigin-RevId: 267531307
(cherry picked from commit dc29ecea18)
2019-09-06 09:03:05 -07:00
Gaurav Jain
3e08639766 Add incompatible_shape_error attribute to equal op
When tensor equality is enabled, if there is an incompatible shape we
currently throw an exception. Ideally we'd like to return False when
calling __eq__ and True when calling __ne__. We thus modify the Equal
and NotEqual ops to return a boolean upon a shape incompatibility. Due
to this change the shape inference logic needs to be changed to either
return a scalar bool if the shapes are incompatible, or else return an
unknown shape to allow for either a boolean Tensor or scalar to be
returned.

Note the behavior of tf.math.equal & tf.math.not_equal is unchanged as
they both use optimistic shape inference logic when dealing with unknown
dimensions which allows for more efficient graphs rather than inserting
Rank operations.

This distinction between __eq__ & tf.math.equal is also found in numpy
and as a result the tf.debugging.assert_equal and
tf.debugging.assert_none_equal APIs needed to be changed to utilize the
numpy operations.

PiperOrigin-RevId: 267466043
(cherry picked from commit e0e1efbe08)
2019-09-06 09:03:05 -07:00
Zhenyu Tan
e69e5cc79a Fix loss computation when y_true and y_pred are not the same shape.
PiperOrigin-RevId: 267595602
2019-09-06 08:49:15 -07:00
Reed Wanderman-Milne
43c6071ead Improve Layer docstrings in regards to autocasting.
I fixed the examples so they can actually run now. And I mentioned that currently, only the first argument to call() is cast.

PiperOrigin-RevId: 262603558
2019-09-05 18:19:49 -07:00
AG Ramesh
c8319800f9 Update tensorflow/core/graph/mkl_layout_pass_test.cc
updating based on review comments

Co-Authored-By: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com>
2019-09-05 17:05:23 -07:00
AG Ramesh
199d79d32e Add support for Addv2 2019-09-05 17:05:15 -07:00
Alexandre Passos
f67991359e
Merge pull request #32258 from kkimdev/cherrypicks_5XVFU
[r2.0 CherryPick]: @tf.function: Show a warning message when tracing happens too frequently
2019-09-05 15:42:16 -07:00
Reed Wanderman-Milne
821b2fc9f1 Implement __repr__ for LossScale subclasses.
PiperOrigin-RevId: 262603680
2019-09-05 14:12:53 -07:00
Martin Wicke
166b763967 Implementing RFC#126: Allow Op names of the form RepoName>OpName.
PiperOrigin-RevId: 264491560
2019-09-05 13:54:01 -07:00
Kibeom Kim
827e1edc1e @tf.function: Show a warning message when tracing happens too frequently.
PiperOrigin-RevId: 267414002
2019-09-05 13:53:21 -07:00
Goldie Gadde
52f22d913d
Merge pull request #32256 from saxenasaurabh/cherrypicks_JUPKK
[r2.0 Cherrypick]:Make calls to `tf.function(f)()`, `tf.function(f).get_concrete_function` and `tf.function(f).get_initialization_function` thread-safe.
2019-09-05 13:50:33 -07:00
Thomas O'Malley
873bfb7b91 Fix merge conflicts 2019-09-05 12:56:13 -07:00
Saurabh Saxena
739b9c5423 Make calls to `tf.function(f)()`, `tf.function(f).get_concrete_function` and `tf.function(f).get_initialization_function` thread-safe. 2019-09-05 15:52:54 -04:00
Reed Wanderman-Milne
6b9a66d543 Deprecate "_with_float32_vars" policies.
These policies will be removed in TensorFlow 2.1. I plan on removing them very shortly.

PiperOrigin-RevId: 267429724
2019-09-05 12:52:05 -07:00
Reed Wanderman-Milne
a3a9fd34e9 Do not cast inputs to AddLoss layers
This means tensors passed to Model.add_loss will no longer be cast to floatx.

PiperOrigin-RevId: 264287945
2019-09-05 12:51:57 -07:00
Reed Wanderman-Milne
75a9d99941 Add mixed_float16 and mixed_bfloat16 dtype policies.
These policies will be the recommended way of using mixed precision in tf.keras. So far, the only difference from [b]float16_with_float32_vars is that mixed_float16 enables dynamic loss scaling by default (and mixed_bfloat16 has no difference). In the future, the *_with_float32_vars policies will be removed.

PiperOrigin-RevId: 263206151
2019-09-05 12:51:57 -07:00
Reed Wanderman-Milne
0fe04a9a52 Add loss_scale field to Policy.
A Keras Model will wrap its optimizer with a LossScaleOptimizer, if its policy has a loss scale. This way, users do not have to manually wrap their optimizers with a LossScaleOptimizer.

PiperOrigin-RevId: 263186511
2019-09-05 12:51:57 -07:00
T.J. Alumbaugh
cdb35502d8 NNAPI TransposeConv op takes tensor inputs from TFLite node
PiperOrigin-RevId: 267384015
2019-09-05 12:24:09 -07:00
Goldie Gadde
705083378c
Merge pull request #32189 from tensorflow/cherrypicks_U0F4M
[r2.0:Cherrypick]Negating cosine similarity loss so that the value gets minimized during training without needing a wrapper function.
2019-09-05 10:35:12 -07:00
Mihai Maruseac
3891e4f092
Merge pull request #32080 from guillaumekln/r2.0-cherry-pick-functions-input-shape
[r2.0:Cherrypick] Remove tensor input shape from function signature.
2019-09-05 09:47:15 -07:00
Goldie Gadde
f6c418757e
Merge pull request #32220 from tensorflow/cherrypicks_2BM2H
[r2.0:Cherrypick]:Fix add_metric name ordering issue.
2019-09-05 09:09:51 -07:00
Goldie Gadde
1bd1f8428e
Merge pull request #32181 from mrry/cancellation_cherrypick
[r2.0 Cherrypick] Fix use-after-free of CancellationManager in LocalRendezvousImpl.
2019-09-05 09:09:00 -07:00
Goldie Gadde
4932df55e3
Merge pull request #31937 from tomerk/cherrypicks_LRWAT
r2.0-Cherrypick:Makes `nest` able to flatten dictionary views (produced by dict.items…
2019-09-04 16:39:00 -07:00
Pavithra Vijay
cf3386a93c Fix add_metric name ordering issue.
PiperOrigin-RevId: 267226711
2019-09-04 16:29:12 -07:00
Goldie Gadde
b74c58f43d
Merge pull request #32094 from penpornk/cherrypicks_YQKBS
r2.0 cherry-pick request: [Intel MKL] Upgrade MKL-DNN to 0.20.3
2019-09-04 10:57:11 -07:00
Goldie Gadde
f079372183
Merge pull request #32123 from k-w-w/cherrypicks_9TURB
Track trackables in graph networks
2019-09-03 14:31:31 -07:00
Pavithra Vijay
80aa5a25a7 Automated rollback of commit c3fb862245
PiperOrigin-RevId: 266977539
2019-09-03 13:41:48 -07:00
Anna R
0a702eb670 Only add ModuleWrapper when lazy loading is requested or when using TF 1.x (to
print deprecation warnings).

PiperOrigin-RevId: 266944067
2019-09-03 10:16:23 -07:00
Derek Murray
e8ee6b8d67 Fix use-after-free of CancellationManager in LocalRendezvousImpl.
Previously, we were invoking the CancellationManager in ~Item, which runs after the done callback. However, the CancellationManager was borrowed from the calling RecvOp, and it will tend to be deleted synchronously when the done callback executes.

PiperOrigin-RevId: 266567112
2019-09-03 09:58:46 -07:00
Vishnuvardhan Janapati
e300cc261e
Corrected typo in ones_like
TF Website shows `zero` in place of `one`. 
This fixes the issue https://github.com/tensorflow/tensorflow/issues/32129
2019-09-03 09:03:13 -07:00
Katherine Wu
4d64a47c3e Track trackables in graph networks, and remove automatic tracking of Keras-internal attributes.
Also resolves loading bug from #31893.

PiperOrigin-RevId: 266440936
2019-08-30 17:27:49 -07:00
rxsang
25006be096
Merge pull request #32061 from tensorflow/rxsang-patch-1
Cherrypick: add an `enter_master_device` flag in tf.config.experimental_connect_to_cluster API.
2019-08-29 14:15:26 -07:00
AG Ramesh
6c6c81fa37 Upgrade MKL DNN 2019-08-29 13:43:49 -07:00
Tong Shen
b2e64f8e94 Remove tensor input shape from function signature.
PiperOrigin-RevId: 265973257
2019-08-29 11:35:45 +02:00
Katherine Wu
2c772511d9 Add SaveOptions object with option to whitelist op namespaces. Added options argument to all functions that save out a SavedModel.
PiperOrigin-RevId: 266021878
2019-08-28 17:54:41 -07:00
Tomer Kaftan
f6489a2d5a
Update nest.py
Another fix to the cherrypick
2019-08-28 14:54:56 -07:00
rxsang
0ff7bcc59a
Update v1 golden file. 2019-08-28 14:13:50 -07:00
rxsang
d0c2bf3226
Update v2 golden file. 2019-08-28 14:12:37 -07:00
rxsang
733002ffed
Add enter_master_device flag. 2019-08-28 14:08:35 -07:00
Goldie Gadde
69b1feac62
Merge pull request #32057 from tensorflow/backend_api_cherrypick
[r2.0:Cherrypick] `is_keras_tensor` in the Keras backend API.
2019-08-28 12:32:01 -07:00
Tomer Kaftan
4d19860414
Update nest.py
Added in missing part of the cherrypick
2019-08-28 10:58:07 -07:00
François Chollet
ca2f3de8da Add is_keras_tensor to the public API. 2019-08-28 10:52:11 -07:00
Goldie Gadde
ad9ed12ce4
Merge pull request #31941 from haozha111/cherrypicks_X1T3X
r2.0-CherryPick
2019-08-28 10:40:55 -07:00
Goldie Gadde
2f8cd45afb
Merge pull request #31811 from akshaym/cherrypicks_9Y70M
r2.0-CherryPick:Limit py_func on gpu to only take numeric types
2019-08-28 09:36:28 -07:00
Goldie Gadde
aac18a3ffb
Merge pull request #31812 from saxenasaurabh/cherrypicks_HM5OH
r2.0-CherryPick:Create zeros grad of the correct shape for ResourceVariables.
2019-08-27 13:31:59 -07:00
Goldie Gadde
b22eb94079
Merge pull request #31773 from aaroey/r2.0
2.0-CherryPick:TF-TRT API changes and fix regressions
2019-08-27 13:31:34 -07:00
Haoliang Zhang
9033d7a2f1 Add an explanation about using FlatbufferModel::GetMinimumRuntime().
PiperOrigin-RevId: 265133207
2019-08-23 17:09:53 -07:00
Haoliang Zhang
c58a667812 [Fix] fix the logic when comparing the runtime version strings.
PiperOrigin-RevId: 264948915
2019-08-23 17:09:46 -07:00
A. Unique TensorFlower
2ca19947d3 Makes nest able to flatten dictionary views (produced by dict.items(), dict.values(), and dict.keys() in Python 3).
This is done for all mapping views rather than just OrderedDict views because:
1. In Python 2 these all returned lists anyway, so nest worked on them even when dictionary ordering wasn't guaranteed.
2. In Python 3.6 dictionaries became insertion-ordered as an implementation detail, and as of Python 3.7 this became a language feature: https://stackoverflow.com/questions/39980323/are-dictionaries-ordered-in-python-3-6

So, this only poses a not-already-present randomization risk with nest.flatten for:
- people using Python 3 with custom mappings or built-in mappings that don't have order guarantees (it is unclear whether any built-in mappings still lack order guarantees)
- people using dict views with Python 3 versions older than 3.6

Note: This CL makes nest.pack_sequence_as with views as structures return a list rather than a mapping view, because built-in mapping views cannot be instantiated directly.
PiperOrigin-RevId: 264433983
2019-08-23 15:34:14 -07:00
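The dict-view flattening behavior described in the commit above can be sketched in plain Python. This is a minimal illustration, not TensorFlow's actual `nest` implementation; `flatten` here handles only mappings, mapping views, lists/tuples, and leaves:

```python
import collections.abc

def flatten(structure):
    """Minimal sketch of nest-style flattening that accepts mapping views."""
    # Mapping views (dict.keys()/values()/items()) are flattened in their
    # iteration order, which is insertion order in Python 3.7+.
    if isinstance(structure, (collections.abc.KeysView,
                              collections.abc.ValuesView,
                              collections.abc.ItemsView)):
        return [leaf for item in structure for leaf in flatten(item)]
    if isinstance(structure, collections.abc.Mapping):
        # The real nest sorts dict keys; mimic that here.
        return [leaf for key in sorted(structure)
                for leaf in flatten(structure[key])]
    if isinstance(structure, (list, tuple)):
        return [leaf for item in structure for leaf in flatten(item)]
    return [structure]

d = {"b": 2, "a": 1}
print(flatten(d.values()))  # insertion order: [2, 1]
print(flatten(d))           # sorted keys:     [1, 2]
```

Note the asymmetry the commit accepts: a plain dict is flattened in sorted-key order, while a view is flattened in iteration (insertion) order.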
Alexandre Passos
c75bb66a99
Merge pull request #31898 from mrry/cherrypicks_CPD6K
[r2.0 cherrypick] Fix tf.gradients() performance regression
2019-08-22 10:40:23 -07:00
Derek Murray
c99513c0ba In _GradientsHelper() compute the ObjectIdentitySet(xs) once and reuse it.
This avoids a potentially quadratic execution time in building the gradient graph, because we were previously creating the set multiple times for each op in the graph.

PiperOrigin-RevId: 264826531
2019-08-22 09:24:07 -07:00
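The performance fix described above — building the identity set of `xs` once rather than once per op — can be sketched as follows. This is a hypothetical simplification: `ObjectIdentitySet` here is a stand-in for TensorFlow's internal helper, and ops are modeled as tuples of inputs:

```python
class ObjectIdentitySet:
    """A set keyed on object identity rather than __eq__/__hash__."""
    def __init__(self, items=()):
        self._by_id = {id(x): x for x in items}

    def __contains__(self, item):
        return id(item) in self._by_id

def gradients_helper(ops, xs):
    # Before the fix, ObjectIdentitySet(xs) was rebuilt for each op in the
    # graph, making gradient-graph construction quadratic. Building it once
    # and reusing it restores linear behavior.
    xs_set = ObjectIdentitySet(xs)
    return [op for op in ops if any(inp in xs_set for inp in op)]
```

Identity-based membership matters because real Tensors cannot be placed in an ordinary `set` once `==` returns a Tensor instead of a bool.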
Goldie Gadde
b8b60ae09a
Merge pull request #31742 from tensorflow/ggadde-cp12
r2.0-Cherrypick: Move to cuDNN 7.6.2 and TensorRT 5.1.5
2019-08-21 22:12:03 -07:00
Goldie Gadde
305ecc6ed5
Merge pull request #31878 from tensorflow/ggadde-cp14
Revert skipping EIGEN_FORCE_INLINE change & potential fix for windows flaky build
2019-08-21 22:11:41 -07:00
A. Unique TensorFlower
12af54e67b Provide a unique dir for each compiling action to avoid conflicts.
PiperOrigin-RevId: 264635106
2019-08-21 21:16:28 -07:00
A. Unique TensorFlower
6a4957735c Automated rollback of commit 92b7212e54
PiperOrigin-RevId: 264744295
2019-08-21 21:13:05 -07:00
Goldie Gadde
b12998c174
Merge pull request #31862 from jdduke/cherrypicks_DVR9A
Fix regression in memory consumption on arm64 devices
2019-08-21 21:03:14 -07:00
Goldie Gadde
217315e22e
Merge pull request #31869 from jaingaurav/cherry-2.0
Fix v2 compatibility with moving average
2019-08-21 21:02:32 -07:00
Goldie Gadde
eb85412f3c
Merge pull request #31864 from robieta/cherrypicks_USYUF
r2.0-CherryPick: Refactor the keras TensorLikeDataAdapter (numpy array, EagerTensor, etc) to use tf.shuffle rather than np.shuffle
2019-08-21 18:07:15 -07:00
Goldie Gadde
32b18bcb0e
Merge pull request #31856 from tensorflow/cherrypicks_UNEFG
1. Fix incorrect steps inference when validation_split is provided in…
2019-08-21 17:43:19 -07:00
Pavithra Vijay
c458b1cca4 Updating regex with inline dotall flag. 2019-08-21 15:53:57 -07:00
Goldie Gadde
66876a263f
Merge pull request #31863 from rachellim/cherrypicks_9AS72
Fix incompatibility between tf.data rebatching fallback & unknown batch dim (from partial batch)
2019-08-21 15:13:52 -07:00
Goldie Gadde
b42e5925e6
Merge pull request #31857 from rohan100jain/cherrypicks_5PW8E
Fixing a couple of issues with DenseFeatures
2019-08-21 15:11:56 -07:00
Gaurav Jain
adf96a1258 Fix v2 compatibility with moving average
PiperOrigin-RevId: 264688574
(cherry picked from commit dc3534c6a5)
2019-08-21 15:00:22 -07:00
Pavithra Vijay
b3916ef9c6 Escaping / for regex. 2019-08-21 14:57:16 -07:00
Taylor Robie
df96b0acdb Refactor the keras TensorLikeDataAdapter (numpy array, EagerTensor, etc) to use tf.shuffle rather than np.shuffle. This allows us to use more of tf.data's pipelining machinery which both improves multi-epoch performance and decreases memory consumption.
PiperOrigin-RevId: 264681581
2019-08-21 14:15:35 -07:00
Rachel Lim
7107f907aa [tf.data] Make rebatching fallback work for datasets with unknown shapes.
PiperOrigin-RevId: 264501770
2019-08-21 13:43:48 -07:00
Rachel Lim
6208021e3d [tf.data] s/workers/replicas in all rebatching related files for consistency with distribution strategy naming conventions (https://github.com/tensorflow/community/blob/master/rfcs/20181016-replicator.md).
PiperOrigin-RevId: 261958155
2019-08-21 13:43:34 -07:00
Benoit Jacob
1790e093de Don't round the allocator's storage size to the next power of two. This is typically a huge buffer. We're going to reach a steady state where we have only a few such buffers and they won't get frequently reallocated, anyway.
PiperOrigin-RevId: 264669851
2019-08-21 13:07:51 -07:00
Benoit Jacob
ee25c2a3a6 Fix allocator in cases of sizes overflowing 32bit integer arithmetic
in size_util.

Part of it was AllocateFast not checking if ptr_ is null
before using it (null deref with offset, so didn't look like a null deref).

Part of it was using round_up_pot with a large size_t value that got implicitly cast to int, as round_up_pot took an int argument. This showed it's safer to just templatize the helpers in size_util.h and make them accept either int32 or int64 (guarded in floor_log2, which is the only one of these functions that cares).

I just changed the allocator to use only signed types (std::size_t --> std::ptrdiff_t) because I didn't want to deal with the extra complexity of handling both signed and unsigned in size_util.

PiperOrigin-RevId: 264668215
2019-08-21 13:07:37 -07:00
Rohan Jain
c8f3235a03 Fixing a couple of issues with DenseFeatures
1. Fixing variable_scope to not pass in the partition_info argument if it's in V2 mode. This ensures compatibility with V2 initializers.
2. Adding a tracking_name argument to add_variable which allows for passing in a different name for tracking purposes without affecting the variable name.

PiperOrigin-RevId: 264658202
2019-08-21 13:02:14 -07:00
Pavithra Vijay
947f2c4ad4 1. Fix incorrect steps inference when validation_split is provided in fit in v2 single path execution.
2. Infer validation_steps/steps for a dataset when validation_steps/steps is not provided in fit/(evaluate and predict).

PiperOrigin-RevId: 264508014
2019-08-21 11:46:45 -07:00
Goldie Gadde
71d73e56a2
Merge pull request #31822 from tensorflow/ggadde-cp13
r2.0-CherryPick:test fixes for the windows release builds.
2019-08-20 22:30:20 -07:00
Goldie Gadde
28a383988d
Merge pull request #31815 from kkimdev/cherrypicks_Q43SS
r2.0 cherry-pick request: Autograph: zip() with tf.data fix
2019-08-20 21:41:53 -07:00
Saurabh Saxena
4a416ef792 Internal change
PiperOrigin-RevId: 264475263
2019-08-20 21:36:04 -07:00
A. Unique TensorFlower
cdf62283bf Skip test on Windows platform.
PiperOrigin-RevId: 264463634
2019-08-20 21:29:58 -07:00
Ilham Firdausi Putra
34eb0b58b9 Fix argument naming and tuple on zip 2019-08-20 15:47:44 -07:00
Ilham Firdausi Putra
c49ae107aa Override zip on Autograph 2019-08-20 15:47:34 -07:00
Ilham Firdausi Putra
7e74596547 Add test cases for zip 2019-08-20 15:47:18 -07:00
Saurabh Saxena
0e772b8c96 Create zeros grad of the correct shape for ResourceVariables.
Fixes #31297

PiperOrigin-RevId: 264312145
2019-08-20 12:29:31 -07:00
Akshay Modi
24d2531449 Fix typos in EagerPyFunc GPU kernel registration
PiperOrigin-RevId: 264429933
2019-08-20 12:19:00 -07:00
Akshay Modi
7382c4b45f Limit py_func on gpu to only take numeric types
PiperOrigin-RevId: 264188322
2019-08-20 12:19:00 -07:00
Tomer Kaftan
3f3c728bf8
Merge pull request #31769 from tomerk/cherrypicks_FGHW0
Cherrypick Request: Updates the tf.print docstring for TF 2.0.
2019-08-19 20:23:36 -07:00
Goldie Gadde
5b3f39156f
Merge pull request #31780 from penpornk/cherrypicks_VO02I
r2.0 cherry-pick request: [Intel MKL] Fixing a member variable initialization issue
2019-08-19 19:48:55 -07:00
Goldie Gadde
e307f25d1c
Merge pull request #31772 from jaingaurav/cherry-2.0
Cherry-pick Tensor equality fixes
2019-08-19 19:46:50 -07:00
Allen Lavoie
a6efea89d1 tf.train.Checkpoint: Fix for tensor equality incompatibility in name-based restore logic
PiperOrigin-RevId: 263601820
(cherry picked from commit 84ddc218bb)
2019-08-19 18:34:12 -07:00
Clayne Robison
dc4f698238 [Intel MKL] Fixing a member variable initialization issue detected by static code scans. 2019-08-19 17:02:41 -07:00
Guangda Lai
d90e836507 Use generator function for calibration and build() and remove redundant num_runs/num_calibration_runs arguments. 2019-08-19 15:56:09 -07:00
Guangda Lai
cb6c0c8c20 Raise error if is_dynamic_op is set to False in 2.0, and fix corresponding test 2019-08-19 15:56:03 -07:00
Guangda Lai
1fbdcea362 Fix pylint errors 2019-08-19 15:55:56 -07:00
Guangda Lai
55e408c2e9 Change build() to accept num_runs and input_fn, to make API consistent. 2019-08-19 15:55:49 -07:00
Guangda Lai
0423e0f30c Deprecate static mode and fix corresponding test 2019-08-19 15:55:25 -07:00
Guangda Lai
fee3d86071 Terminate calibration in TrtGraphConverterV2.convert() and improve the test to cover that. 2019-08-19 15:55:17 -07:00
Guangda Lai
61de424923 Simplify trt_convert_test 2019-08-19 15:55:08 -07:00
Guangda Lai
4961524653 Remove redundant variable _calibration_data_collected; inline _calibrate() function; fix python formatting issues. 2019-08-19 15:54:57 -07:00
Pooya Davoodi
005de1a79c Add back tests for converted_func 2019-08-19 15:54:50 -07:00
Pooya Davoodi
ec5c02724a Add calibration to TrtGraphConverterV2.convert
Add TrtGraphConverterV2.build

Also do not return function from convert.

Convert dict_values to list for python3

Fix tests as well

Fix pylint errors
2019-08-19 15:54:42 -07:00
Guangda Lai
163f9df4c7 Save calibration table after calibration, so it can support multiple engines in
int8 mode.

PiperOrigin-RevId: 263857264
2019-08-19 15:43:35 -07:00
Guangda Lai
ed47e70590 Support static mode in TF 2.0. Compared to building engines by calling the
converted function returned from convert(), this has the advantage that it
doesn't require input data from the user.

PiperOrigin-RevId: 263592310
2019-08-19 15:40:38 -07:00
Guangda Lai
d5e9bcc00f Hide helper classes from public API, and update documentation about how to use
V2 converter with INT8 mode.

PiperOrigin-RevId: 263200301
2019-08-19 15:37:12 -07:00
Alexandre Passos
01d28693dd Make padded_batch_test robust to the eq changes.
PiperOrigin-RevId: 263840727
(cherry picked from commit 76221c94dc)
2019-08-19 15:11:31 -07:00
Saurabh Saxena
c55b77eb70 Do not use tensors as keys in _EagerTensorCache since tensors are no longer hashable.
PiperOrigin-RevId: 263666059
(cherry picked from commit 63155ea68e)
2019-08-19 15:11:03 -07:00
Saurabh Saxena
de02c794d6 Prepare feature_columns_test and dense_features_test for tensor equality.
PiperOrigin-RevId: 263450143
(cherry picked from commit 9a0ef32d7b)
2019-08-19 15:10:37 -07:00
Saurabh Saxena
48fa7d9881 Prepare cudnn_recurrent_test for Tensor equality.
PiperOrigin-RevId: 263449689
(cherry picked from commit ff61cee968)
2019-08-19 15:10:25 -07:00
Saurabh Saxena
97071c535c Prepare keras/backend_test for Tensor equality changes.
PiperOrigin-RevId: 263453765
(cherry picked from commit 9503dc4440)
2019-08-19 15:09:36 -07:00
Kibeom Kim
e492005f67 Prepare tensorflow/python/eager:ops_test for Tensor equality.
PiperOrigin-RevId: 263589781
(cherry picked from commit 74942377c6)
2019-08-19 15:08:54 -07:00
Kibeom Kim
58b9d190a8 Prepare //tensorflow/python/data/kernel_tests:map_test for Tensor equality.
PiperOrigin-RevId: 263589206
(cherry picked from commit fc27466628)
2019-08-19 15:08:41 -07:00
Kibeom Kim
070e519450 Prepare //tensorflow/python:function_def_to_graph_test for Tensor equality.
PiperOrigin-RevId: 263563061
(cherry picked from commit c84f6e2b65)
2019-08-19 15:08:22 -07:00
Kibeom Kim
5b27c05c38 Prepare //tensorflow/python/eager:function_test for Tensor equality.
PiperOrigin-RevId: 263562894
(cherry picked from commit 913f565e0f)
2019-08-19 15:08:09 -07:00
Kibeom Kim
beff50ca87 Prepare //tensorflow/python:function_test for Tensor equality.
PiperOrigin-RevId: 263500156
(cherry picked from commit 8d67ec5fa3)
2019-08-19 15:07:49 -07:00
Kibeom Kim
2dbb74261f tf.function: tf.gather Graph mode axis argument fix
PiperOrigin-RevId: 263245888
(cherry picked from commit a03e530512)
2019-08-19 15:06:49 -07:00
Goldie Gadde
b2a68a4907
Merge pull request #31715 from jsimsa/cherrypicks_UZ9Z2
2.0 cherry-pick request: tf.data fix
2019-08-19 15:02:00 -07:00
Kibeom Kim
5417befa7f Prepare //tensorflow/python/keras:metrics_correctness_test for Tensor equality.
PiperOrigin-RevId: 263671686
(cherry picked from commit c02cc7e858)
2019-08-19 14:55:37 -07:00
Kibeom Kim
5c9a16c77d Prepare //tensorflow/python/training/tracking:util_with_v1_optimizers_test for Tensor equality.
PiperOrigin-RevId: 263718652
(cherry picked from commit a24e8f4da1)
2019-08-19 14:55:13 -07:00
Yanhua Sun
10111d8bd5 Fix assertIn for eq change in save_model
PiperOrigin-RevId: 263610354
(cherry picked from commit ccb5bcc69a)
2019-08-19 14:53:39 -07:00
Yanhua Sun
b7ec637af1 Automated rollback of commit eb478151c2
PiperOrigin-RevId: 264184909
(cherry picked from commit 3d880adb04)
2019-08-19 14:51:34 -07:00
Yanhua Sun
85e08754e1 Automated rollback of commit 4089730950
PiperOrigin-RevId: 263396507
(cherry picked from commit eb478151c2)
2019-08-19 14:51:25 -07:00
Gaurav Jain
48378e5237 Enable Tensor equality for 2.0
Fixes #9359

PiperOrigin-RevId: 262948811
(cherry picked from commit 4089730950)
2019-08-19 14:51:14 -07:00
Gaurav Jain
c9e4bdd566 Add equality tests with broadcasting
PiperOrigin-RevId: 264224683
(cherry picked from commit fc56e08e1b)
2019-08-19 14:50:19 -07:00
Yanhua Sun
84a8c340bd Fix tests for eq change
PiperOrigin-RevId: 263784883
(cherry picked from commit e16348a81a)
2019-08-19 14:43:12 -07:00
Yanhua Sun
e3e60d26f8 Disable eq for tensors running in graph mode
PiperOrigin-RevId: 263780887
(cherry picked from commit 2861da0bf6)
2019-08-19 14:42:55 -07:00
Yanhua Sun
e2342aa7ee Use is for Tensor identity comparison.
PiperOrigin-RevId: 263702957
(cherry picked from commit e03a290d8a)
2019-08-19 14:42:43 -07:00
Yanhua Sun
6b5f02024a Fix set in training for eq change
PiperOrigin-RevId: 263702884
(cherry picked from commit 46b9f950d5)
2019-08-19 14:42:33 -07:00
Yanhua Sun
7cc6b3a9e3 Replace dict with list for eq change
PiperOrigin-RevId: 263647316
(cherry picked from commit 923c55a659)
2019-08-19 14:42:21 -07:00
Yanhua Sun
833432df2a Replace map with objectIdentityDict to fix math_grad
PiperOrigin-RevId: 263632918
(cherry picked from commit c9443f07d2)
2019-08-19 14:42:02 -07:00
Yanhua Sun
58969d36aa Fix training_test by using AssertIs
PiperOrigin-RevId: 263631974
(cherry picked from commit 68e332dc3c)
2019-08-19 14:41:56 -07:00
Martin Wicke
13f5ffcc56 Use "is" instead of ==, and Reference for dict keys {} in Bijector.
PiperOrigin-RevId: 263629253
(cherry picked from commit f7f6f8655a)
2019-08-19 14:41:44 -07:00
Martin Wicke
6c2ec4e39f Use id() for Tensor identity comparison.
PiperOrigin-RevId: 263627848
(cherry picked from commit 053f39e766)
2019-08-19 14:41:30 -07:00
Martin Wicke
89ba20d03c Replace == with is in test comparing variable identity.
PiperOrigin-RevId: 263627688
(cherry picked from commit 5f1f68499c)
2019-08-19 14:41:18 -07:00
Gaurav Jain
b363a41e62 Fix issues when tensor equality is enabled
PiperOrigin-RevId: 263589274
(cherry picked from commit 8651257d36)
2019-08-19 14:41:04 -07:00
Martin Wicke
800701404b Avoid tensor == in list membership test
PiperOrigin-RevId: 263505153
2019-08-19 14:40:51 -07:00
Yanhua Sun
77f168f69d Replace set(tensors) with set(id(tensors)) for eq change
PiperOrigin-RevId: 263473977
2019-08-19 14:40:32 -07:00
Yanhua Sun
d812281027 Fix assert a in b in lstm with assertTrue(any(a is b))
PiperOrigin-RevId: 263473895
2019-08-19 14:40:18 -07:00
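The pattern behind this fix — replacing an `assertIn`-style `in` check, which falls back to `==`, with an explicit identity check — can be illustrated in plain Python. `FakeTensor` is a hypothetical stand-in for a Tensor whose `==` returns an object rather than a bool:

```python
class FakeTensor:
    """Stand-in for a Tensor: `==` returns another object, not a bool."""
    def __eq__(self, other):
        return FakeTensor()  # truthy elementwise-style result, not a bool
    __hash__ = None  # unhashable, like TF 2.0 tensors

def contains_by_identity(item, seq):
    # `item in seq` may call __eq__ and misbehave once == is overloaded;
    # comparing identities sidesteps that entirely.
    return any(item is element for element in seq)

t = FakeTensor()
print(contains_by_identity(t, [FakeTensor(), t]))  # True
print(contains_by_identity(FakeTensor(), [t]))     # False
```

Note that `FakeTensor() in [t]` would wrongly evaluate truthy here, because the `__eq__` result is a truthy object; the identity check returns the correct False.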
Gaurav Jain
82cc8f6391 Tensor equality fixes for function_test
PiperOrigin-RevId: 263462725
2019-08-19 14:35:51 -07:00
Yanhua Sun
3ea3e38ec1 Replace map with ObjectIdentityDict for eq change
PiperOrigin-RevId: 263458266
2019-08-19 14:35:32 -07:00
Yanhua Sun
a7425bab38 Replace set(tensor) with id(tensor) for eq change
PiperOrigin-RevId: 263454392
2019-08-19 14:35:22 -07:00
Yanhua Sun
d4a92a6d5e Replace None in [] with is None for eq change
PiperOrigin-RevId: 263453410
2019-08-19 14:35:02 -07:00
Yanhua Sun
85301ece74 Fix rmsprop_test for eq change
PiperOrigin-RevId: 263439386
2019-08-19 14:34:39 -07:00
Yanhua Sun
f4aa98af47 Fix while_v2 for eq change
PiperOrigin-RevId: 263434775
2019-08-19 14:34:27 -07:00
Yanhua Sun
853ff441b7 Fix template for eq change
PiperOrigin-RevId: 263432053
2019-08-19 14:34:14 -07:00
Yanhua Sun
690e3fa140 Fix core_test due to eq change
PiperOrigin-RevId: 263428851
2019-08-19 14:33:57 -07:00
Yanhua Sun
a17fd9dc33 Use _ref to fix unhashable tensor error for eq change
PiperOrigin-RevId: 263428471
2019-08-19 14:33:43 -07:00
A. Unique TensorFlower
c609e0ab8d Ensure to use tensor references when dealing with sets in TF 2.
PiperOrigin-RevId: 263401414
2019-08-19 14:33:27 -07:00
Yanhua Sun
9699aaac38 Replace map with ObjectIdentityDict for eq change
PiperOrigin-RevId: 263353699
2019-08-19 14:32:04 -07:00
Alexandre Passos
530a522769 Make custom_gradient.py eq-safe.
PiperOrigin-RevId: 263218366
2019-08-19 14:31:13 -07:00
Gaurav Jain
942df35882 Ensure function CacheKey is equality-safe
When comparing CacheKeys we cannot simply compare the namedtuple fields
since that might involve comparing 2 Variables whose equality comparison
returns a Tensor rather than a bool. We thus compare Variables using
their class name, dtype & shape, similar to the processing we do for the
hash.

PiperOrigin-RevId: 262935453
2019-08-19 14:29:12 -07:00
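The approach the message describes — summarizing Variables by class name, dtype, and shape so cache keys compare with plain bools — might be sketched like this. This is a hypothetical simplification of the real CacheKey logic, with `FakeVariable` standing in for `tf.Variable`:

```python
class FakeVariable:
    """Stand-in for tf.Variable: `==` returns a tensor-like object, not a bool."""
    def __init__(self, dtype, shape):
        self.dtype, self.shape = dtype, shape

    def __eq__(self, other):
        return object()  # elementwise-comparison result, unusable as a bool

def encode_arg(arg):
    # Replace variables with a hashable, equality-safe summary, similar in
    # spirit to the commit: class name + dtype + shape.
    if isinstance(arg, FakeVariable):
        return (type(arg).__name__, arg.dtype, tuple(arg.shape))
    return arg

def make_cache_key(args):
    return tuple(encode_arg(a) for a in args)

k1 = make_cache_key([FakeVariable("float32", [2, 3]), 7])
k2 = make_cache_key([FakeVariable("float32", [2, 3]), 7])
print(k1 == k2)  # True: comparison stays a plain bool
```

Comparing the raw namedtuple fields would instead invoke `FakeVariable.__eq__` and yield a non-bool, which is exactly the failure mode the commit avoids.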
A. Unique TensorFlower
6811e43160 Updates the tf.print docstring for TF 2.0.
PiperOrigin-RevId: 264181110
2019-08-19 13:07:56 -07:00
Penporn Koanantakool
961eaa691a Fix curl build rules. 2019-08-19 12:37:48 -07:00
Penporn Koanantakool
9a9d5fa42d Fix the mirror URL. 2019-08-18 23:53:13 -07:00
Clayne Robison
f0955efedc [Intel MKL] Upgrading curl to 7.65.3 to fix CVE-2019-5443 2019-08-18 23:53:13 -07:00
Toby Boyd
ebb92c7b8c Move to cuDNN 7.6.2 and TensorRT 5.1.5 2019-08-18 17:46:59 -07:00
Goldie Gadde
0e41a9125f
Merge pull request #31710 from tensorflow/fix_optimizer_warning
r2.0-CherryPick:Fix unwanted deprecation warning in optimizers v2
2019-08-18 17:36:04 -07:00
Goldie Gadde
0a4fbccbd7
Merge pull request #31704 from jhseu/lstm
r2.0-CherryPick:Fix LSTMs in TPUStrategy.
2019-08-18 17:33:40 -07:00
Jiri Simsa
9d8571fd48 [tf.data] Add non-deterministic seed code path for RandomSeedGenerator to match TF 1.X behavior.
Fixes: #31706
PiperOrigin-RevId: 263878374
2019-08-16 19:15:52 -07:00
Goldie Gadde
b0fee96b1d
Merge pull request #31708 from tensorflow/ggadde-cp11
r2.0-CherryPick: More test and bug fixes.
2019-08-16 18:17:10 -07:00
François Chollet
61f9c06b35 Fix unwanted deprecation warning in optimizers v2 2019-08-16 15:17:27 -07:00
Goldie Gadde
bdbaf055f1
Merge pull request #31699 from jdduke/cherrypicks_5UP6F
Ensure native libs are loaded when using NnApiDelegate
2019-08-16 14:19:53 -07:00
Goldie Gadde
4756cfbbec
Merge pull request #31673 from saxenasaurabh/cherrypicks_VWQL1
Make `maybe_set_static_shape` a no-op when `shape` is a python constant.
2019-08-16 14:19:11 -07:00
Goldie Gadde
b8c634ea60
Merge pull request #31563 from jdduke/cherrypicks_8JAWG
Correctly convert const int8 weights to uint8 for NNAPI
2019-08-16 14:16:11 -07:00
Zhenyu Tan
393e4bc06c Also capture variable during hyper creation.
PiperOrigin-RevId: 263071623
2019-08-16 14:02:39 -07:00
Guangda Lai
32510d7e91 Convert dict_values to list to support indexing in python3.
PiperOrigin-RevId: 263825828
2019-08-16 14:02:28 -07:00
Jonathan Hseu
412938d821 Fix LSTMs in TPUStrategy. We need to check the outer graph for the control flow context to find out whether we're in a tpu.replicate().
PiperOrigin-RevId: 263821933
2019-08-16 12:40:21 -07:00
Jared Duke
ccdc0a9b7a Use the NnApiDelegate directly with Interpreter.Options.setUseNNAPI.
The dynamic useNNAPI method is deprecated, so avoid it when using the
blessed NNAPI path in Interpreter.Options.

PiperOrigin-RevId: 263580821
2019-08-16 10:57:27 -07:00
Jared Duke
657516bc21 Reland "Ensure native libs are loaded when using NnApiDelegate"
PiperOrigin-RevId: 263384929
2019-08-16 10:57:24 -07:00
Alexandre Passos
a671a14cfe
Merge pull request #31672 from penpornk/cherrypicks_DI6UL
r2.0-rc0 cherry-pick request: [INTEL MKL] Fix for Batchmatmul regression
2019-08-16 09:02:53 -07:00
Saurabh Saxena
cd2e770305 Make maybe_set_static_shape a no-op when shape is a python constant.
`maybe_set_static_shape` is only meant to handle cases that C++ shape inference cannot, i.e. when shape is a tensor that has a path to a captured placeholder inside a FuncGraph. So this change does not break any use cases we care about.
This fixes an issue with creating spurious constants in the Graph which are unused after shape inference.

PiperOrigin-RevId: 263666943
2019-08-15 23:00:12 -07:00
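The guard described above — skip the work entirely when `shape` is a Python constant, since C++ shape inference already covers that case — might look like this in outline. This is a hypothetical sketch, not the real implementation; `set_shape` is an injected callback standing in for the actual shape-setting logic:

```python
def maybe_set_static_shape(result, shape, set_shape):
    """Sketch: only act when `shape` is tensor-like (here: anything that is
    not a plain Python int/list/tuple), mirroring the no-op described above."""
    if isinstance(shape, (int, list, tuple)):
        return  # Python constant: C++ shape inference already handles this.
    set_shape(result, shape)

calls = []
maybe_set_static_shape("t", [2, 3], lambda r, s: calls.append((r, s)))
print(calls)  # []: no-op for a Python constant, no spurious constant created
maybe_set_static_shape("t", object(), lambda r, s: calls.append((r, s)))
print(len(calls))  # 1: tensor-like shapes still go through
```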
Goldie Gadde
d96ab53c6e
Merge pull request #31667 from tensorflow/ggadde-cp10
Improve NumPy to Dataset performance with vectorized shuffling.
2019-08-15 22:26:13 -07:00
AG Ramesh
c7e34625d6 Changes based on review comments 2019-08-15 21:53:23 -07:00
AG Ramesh
05605c8775 Update tensorflow/core/kernels/mkl_batch_matmul_op.cc
Co-Authored-By: Penporn Koanantakool <38085909+penpornk@users.noreply.github.com>
2019-08-15 21:53:05 -07:00
AG Ramesh
26beb07074 Add support for MklBatchMatMulv2 2019-08-15 21:52:42 -07:00
Thomas O'Malley
9c57a63096 Fix recursion error from NumPy->DS change.
PiperOrigin-RevId: 263703207
2019-08-15 21:40:35 -07:00
Thomas O'Malley
8235fb5642 Improve NumPy to Dataset performance with vectorized shuffling.
PiperOrigin-RevId: 263611727
2019-08-15 17:14:49 -07:00
Goldie Gadde
1ad1f90069
Merge pull request #31663 from tensorflow/ggadde-cp9
r2.0 Cherrypick: bugs fixes and test fixes.
2019-08-15 17:06:13 -07:00
Zhenyu Tan
03429911fa Decrease loss threshold for premade test.
PiperOrigin-RevId: 262280488
2019-08-15 16:12:15 -07:00
Guangda Lai
4850ef3125 Fix TRT tests in OSS build by reducing the GPU memory consumption.
PiperOrigin-RevId: 263579969
2019-08-15 14:44:06 -07:00
Pavithra Vijay
676ff6bf31 Fixing validation callback configuration in fit in single execution path.
- Added 'Train on # steps' message during training
- Set verbose to 0 in the fit validation loop to prevent progbar errors in v1

PiperOrigin-RevId: 263462848
2019-08-15 14:43:25 -07:00
Scott Zhu
c61bedb2b7 Update training v2 execution function to return batch size.
This lets the training function return the correct number of examples to the callbacks.

Also add casting for batch_size to int64, which was broken for multi-worker all reduce.

PiperOrigin-RevId: 263356430
2019-08-15 14:42:33 -07:00
Pavithra Vijay
66edf93f01 1. Do not raise the "steps unsupported with numpy arrays" warning message in the single execution path.
2. Raise an error if the batch_size argument is used when the input is a dataset/generator/Keras sequence.

PiperOrigin-RevId: 263272222
2019-08-15 14:38:04 -07:00
Goldie Gadde
845502ac5e
Merge pull request #31657 from k-w-w/cherrypicks_XSBZI
[2.0 Cherrypick] Bug fixes for Keras SavedModel
2019-08-15 14:23:44 -07:00
Katherine Wu
9045dfb8bc Track lookup tables created in FeatureColumn, and add FeatureColumn saving/loading test.
PiperOrigin-RevId: 263399690
2019-08-15 13:15:07 -07:00
Katherine Wu
88c1b25380 Fix bug where input signature is added for call functions with kwargs.
Issue reported in: #30808
Similar issue: #29545

PiperOrigin-RevId: 263390265
2019-08-15 12:47:25 -07:00
Katherine Wu
d211b76700 Add specific error for functions capturing Keras learning phase, and fix keras saving tests.
PiperOrigin-RevId: 262630016
2019-08-15 12:47:16 -07:00
Goldie Gadde
65e6355ad9
Merge pull request #31632 from tensorflow/ggadde-cp8
r2.0-CherryPick: Fixes for failing tests on windows builds
2019-08-14 22:38:50 -07:00
Rick Chao
aa884ae1ee Skip MultiWorkerTrainingStateTest for a Windows py35 path-not-found error. The root cause is unclear, but skipping the test to unblock the release.
PiperOrigin-RevId: 263432533
2019-08-14 21:03:37 -07:00
Allen Lavoie
6c803cb870 Use an older protobuf API for compatibility with a Windows nightly
PiperOrigin-RevId: 263468822
2019-08-14 21:01:54 -07:00
Goldie Gadde
091f65f95e
Merge pull request #31624 from tensorflow/ggadde-cp7
r2.0 CherryPick: Fix arg typo.
2019-08-14 16:20:01 -07:00
Yanhui Liang
83f14a9fbe Fix arg typo.
PiperOrigin-RevId: 262395899
2019-08-14 14:40:10 -07:00
Alexandre Passos
cf8189bb88
Merge pull request #31620 from tensorflow/ggadde-cp6
r2.0-Cherrypick: TensorEquality related changes
2019-08-14 09:25:41 -07:00
Saurabh Saxena
643d109b3f Prepare convert_to_constants.py for tensor equality changes.
PiperOrigin-RevId: 262655851
2019-08-14 08:21:46 -07:00
Yanhua Sun
360e0db035 Do not apply the new eq change to graph function building mode
PiperOrigin-RevId: 262649875
2019-08-14 08:20:58 -07:00
Yanhua Sun
19ec4ccd82 Export enable_tensor_equality and disable_tensor_equality so that users have a way to opt in and opt out explicitly
PiperOrigin-RevId: 262637047
2019-08-14 08:16:53 -07:00
Kibeom Kim
6492762d59 Disallow dictionary argument for tf.case
Tensors will be unhashable starting in TF 2.0, so disallow
a dictionary argument with a Tensor as a key for tf.case

PiperOrigin-RevId: 262483688
2019-08-14 08:15:03 -07:00
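The rationale above — an unhashable object cannot serve as a dict key, so a predicate-to-function mapping has to become a list of pairs — can be illustrated in plain Python. `FakeTensor` is a hypothetical stand-in for a TF 2.0 tensor:

```python
class FakeTensor:
    """Stand-in for a TF 2.0 tensor: __eq__ overloaded, __hash__ removed."""
    def __eq__(self, other):
        return object()  # elementwise-style result, not a bool
    __hash__ = None      # makes instances unhashable

pred = FakeTensor()
try:
    branches = {pred: lambda: 1}  # dict form: rejected once tensors are unhashable
except TypeError as e:
    print("unhashable:", e)

# The supported alternative: an ordered list of (predicate, fn) pairs.
branches = [(pred, lambda: 1), (FakeTensor(), lambda: 2)]
print(len(branches))  # 2
```

The list-of-pairs form also makes branch ordering explicit, which dict ordering historically did not guarantee.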
Yanhua Sun
c577f4633b Add suggestions in unhashable tensor error message.
PiperOrigin-RevId: 262443293
2019-08-14 08:14:26 -07:00
Yanhua Sun
27f0c213c3 Replace set with ObjectIdentitySet to prepare for eq change in TF
PiperOrigin-RevId: 262423827
2019-08-14 08:08:02 -07:00
Kibeom Kim
cc7b696b00 Fix _ObjectIdentityWrapper __eq__, to be symmetric.
PiperOrigin-RevId: 262421488
2019-08-14 08:06:59 -07:00
Kibeom Kim
0ee3820d07 Implement tensor.experimental_ref() that returns a reference object.
tf.Tensor and tf.Variable will be unhashable in 2.0, so users
can't use them in sets and dictionaries.

This experimental API returns a reference object to the tensor,
and users can use this instead for sets and dictionaries.
It also has a .deref() function that returns the original object.

PiperOrigin-RevId: 262407223
2019-08-14 07:55:41 -07:00
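The reference-object mechanism described above can be sketched in plain Python. This is a simplified illustration, not the actual TF implementation; `FakeTensor` stands in for a real tensor:

```python
class Ref:
    """Hashable reference wrapper keyed on the identity of the wrapped object."""
    def __init__(self, obj):
        self._obj = obj

    def __hash__(self):
        return id(self._obj)

    def __eq__(self, other):
        return isinstance(other, Ref) and self._obj is other._obj

    def deref(self):
        return self._obj  # recover the original object

class FakeTensor:
    __hash__ = None  # unhashable, like tensors in 2.0
    def __eq__(self, other):
        return object()
    def experimental_ref(self):
        return Ref(self)

t = FakeTensor()
seen = {t.experimental_ref()}
print(t.experimental_ref() in seen)       # True: refs to the same tensor match
print(t.experimental_ref().deref() is t)  # True: deref() returns the original
```

Two refs to the same object hash and compare equal, so sets and dict keys work; refs to distinct objects never collide even if the underlying `==` is overloaded.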
Gaurav Jain
5917907e42 Do not compare Tensors to _ACCEPTABLE_CSV_TYPES
We cannot directly compare tensors to dtypes. In fact the previous check
would have always returned false since the comparison was based on id
rather than contents. With tensor equality enabled we see that this test
was invalid and avoid doing the comparison altogether.

PiperOrigin-RevId: 262406666
2019-08-14 07:54:29 -07:00
Yanhua Sun
6f416ed0a9 add id to MirroredVariable, and modify capture key to use tensor_id instead of tensor
PiperOrigin-RevId: 262376081
2019-08-14 07:53:27 -07:00
Gaurav Jain
37a8e557fd Compare functions by id instead of equality
PiperOrigin-RevId: 262341582
2019-08-14 07:52:18 -07:00
Gaurav Jain
f75e87a6e1 Fix unhashable Variable class in input signature
We would like to make Variables unhashable when Tensor equality is
enabled. Unfortunately, they are needed as part of the function cache
key. We can resolve this by instead using the dtype & shape in the cache
key. Note this is a temporary fix until we make Variable a
CompositeTensor.

PiperOrigin-RevId: 262167751
2019-08-14 07:51:17 -07:00
Yanhua Sun
2c1e190ca1 Replace set with ObjectIdentitySet to prepare for eq change in TF
PiperOrigin-RevId: 262060827
2019-08-14 07:49:25 -07:00
Goldie Gadde
3e6a702c88
Merge pull request #31581 from tensorflow/ggadde-cp5
r2.0-cp: Override eigen strong inline to reduce windows build times for debugg…
2019-08-13 06:21:07 -07:00
A. Unique TensorFlower
689d40f473 Override Eigen strong inline to reduce Windows build times for debugging the
failures.

PiperOrigin-RevId: 262857756
2019-08-13 06:08:28 -07:00
Goldie Gadde
eca3413b03
Merge pull request #31552 from dubey/cherrypicks_706UC
r2.0-rc0 cherry-pick request: use NCCL only for all-reduce.
2019-08-12 13:46:49 -07:00
Goldie Gadde
81cef82199
Merge pull request #31553 from guptapriya/cherrypicks_3NEIM
r2.0-rc0 cherry-pick request: Rollback gradients change which broke convergence for NCF
2019-08-12 13:45:17 -07:00
A. Unique TensorFlower
a7103376b0 Correctly convert const int8 weights to uint8 for NNAPI.
PiperOrigin-RevId: 262747813
2019-08-12 13:18:41 -07:00
Stefano Galarraga
3ee2cb4b28 Extracts NNAPIDelegateKernel from nnapi_delegate.cc
PiperOrigin-RevId: 262571387
2019-08-12 13:18:38 -07:00
Goldie Gadde
1959992f4c
Merge pull request #31497 from zhangyujing/zhangyujing-patch-2
Roll back experimental_compile from 2.0
2019-08-12 12:50:19 -07:00
Goldie Gadde
78b028c2d5
Merge pull request #31490 from k-w-w/cherrypicks_HH6F5
Set default training value to `False` when exporting to SavedModel
2019-08-12 10:46:06 -07:00
Goldie Gadde
d77d36a141
Merge pull request #31456 from akshaym/cherrypicks_95FOF
Use "correct" op in while_v2 gradient
2019-08-12 10:41:29 -07:00
Goldie Gadde
6cc66d77c9
Merge pull request #31551 from tensorflow/ggadde-cp4
CherryPick: Only enable graph rewrite for RNN layer in v2 mode (outermost eager con…
2019-08-12 10:40:00 -07:00
Ayush Dubey
ff25a63ab9 Automated rollback of commit c9552455ab
PiperOrigin-RevId: 262647780
2019-08-12 10:28:42 -07:00
Priya Gupta
d4922a3c3a Automated rollback of commit cadb128334. Revert #22231.
PiperOrigin-RevId: 262947974
2019-08-12 10:28:21 -07:00
Scott Zhu
098e2c5b8e Only enable graph rewrite for RNN layer in v2 mode (outermost eager context).
The tf.function approach does not work well in v1 with sessions since it might try to update/mutate the graph between sessions. This change disables the tf.function path in v1 session mode. This prevents users from using the cuDNN kernel either with compat.v2 or tf.disable_eager_execution().

Note the estimator in v2 should still have the graph rewrite support (have cudnn kernel on GPU).

The graph rewrite tests are now run in v2 only since the rewrite in v1 has been disabled.

PiperOrigin-RevId: 262577530
2019-08-12 09:28:44 -07:00
Goldie Gadde
ccaf553d14
Merge pull request #31487 from saxenasaurabh/cherrypicks_PHYVE
Cherrypicks to fix shape inference of some common ops inside functional ops
2019-08-11 21:35:18 -07:00
Yifei Feng
926e66c254
Merge pull request #31511 from tensorflow/ggadde-cp3
Cherrypicks to fix the disconnected graph issue, and missing CUDA compute capabilities.
2019-08-10 09:14:44 -07:00
Amit Patankar
e0b6dd6c7c Add the supported CUDA compute capabilities into the toolchain created for manylinux2010 compatibility.
PiperOrigin-RevId: 262473631
2019-08-10 08:56:59 -07:00
Thomas O'Malley
8b21ae572e Add support for TF op whitelisting in Keras Layers with Keras Tensors as
positional args, kwargs.

PiperOrigin-RevId: 262417758
2019-08-10 08:53:23 -07:00
zhangyujing
e0779db1c3
Update tensorflow.pbtxt 2019-08-09 15:52:53 -07:00
zhangyujing
0b4dc2a14c
Update tensorflow.pbtxt 2019-08-09 15:52:07 -07:00
zhangyujing
b9a7a9e4bb
Update eager_test.py 2019-08-09 15:49:37 -07:00
zhangyujing
f714aec3bf
Update def_function.py 2019-08-09 15:42:16 -07:00
Katherine Wu
93d83d5398 Set default training value to False when exporting layer/model calls to SavedModel.
PiperOrigin-RevId: 262481484
2019-08-09 10:36:14 -07:00
Saurabh Saxena
60e87a3fa7 Fix shape inference of random.uniform with non-scalar alpha/beta, random.poisson with non-scalar lam.
PiperOrigin-RevId: 262268329
2019-08-09 09:47:36 -07:00
Saurabh Saxena
370f4470bd Fix shape inference of some more common ops inside functional ops.
PiperOrigin-RevId: 262184638
2019-08-09 09:47:26 -07:00
Akshay Modi
d32cc881cf Don't return an uninitialized value from TFE_OpNameGetAttrType
PiperOrigin-RevId: 262251730
2019-08-08 11:08:58 -07:00
Goldie Gadde
ff98617eb0
Merge pull request #31437 from saxenasaurabh/cherrypicks_01FLT
Do not accumulate Const nodes created in forward pass in while_v2.
2019-08-08 10:49:33 -07:00
Saurabh Saxena
9c3422db3b Do not accumulate Const nodes created in forward pass in while_v2.
PiperOrigin-RevId: 261958798
2019-08-07 21:07:28 -07:00
Goldie Gadde
f76633ead7
Merge pull request #31386 from annarev/build_v2_by_default
Build v2 by default
2019-08-07 20:25:12 -07:00
Goldie Gadde
0271554289
Merge pull request #31430 from tensorflow/ggadde-cp2
cherrypick to fix the Linux CPU py2 pip builds
2019-08-07 20:24:37 -07:00
Anna R
3de2e5a636 Disable tensorflow/python/debug/examples:examples_test in v2 builds. 2019-08-07 17:21:51 -07:00
Amit Patankar
141a66f9de Install the new future module directly in the virtualenv when building and testing TensorFlow using pip_new.sh.
PiperOrigin-RevId: 262165125
2019-08-07 16:57:35 -07:00
Anna R
7bebfe6cdc Fix / disable a few more tests that don't work with v2 2019-08-07 16:04:24 -07:00
Goldie Gadde
82317e84d7
Merge pull request #31422 from michaelwunder/cherrypicks_97NZL
r2.0-rc0 cherry-pick request: Fix FlushQuantileSummaries Op
2019-08-07 14:48:08 -07:00
Anna R
a880649989 Remove redundant dependencies on contrib. Add tensorflow/python/tpu:tpu_lib to pip dependencies 2019-08-07 14:43:20 -07:00
Goldie Gadde
53af10c85a
Merge pull request #31418 from tensorflow/ggadde-cp-1
Cherrypick for fixing the tests timing out.
2019-08-07 12:00:03 -07:00
Anna R
ba94d6f782 Remove reference to tensorboard_targets in pip_smoke_test 2019-08-07 11:54:01 -07:00
A. Unique TensorFlower
0314c381d2 Fix FlushQuantileSummaries Op so we can repeatedly use resource.
PiperOrigin-RevId: 262043164
2019-08-07 14:39:10 -04:00
Anna R
6eed92c3e2 Revert a few temporary changes that aren't supposed to be submitted 2019-08-07 11:26:15 -07:00
Anna R
4de05d7afd Fixing / disabling a few tests failing when we run with TF v2. 2019-08-07 11:23:28 -07:00
Goldie Gadde
5c9f008d8c Merge branch 'r2.0' of github.com:tensorflow/tensorflow into r2.0 2019-08-07 11:05:30 -07:00
Shining Sun
a4117b5033 Increase shard counts for some tests in hope of solving the timeout issue.
PiperOrigin-RevId: 262001676
2019-08-07 11:04:44 -07:00
Goldie Gadde
856917ba24
Merge pull request #31382 from tensorflow/mm-r2.0-security-cherry-pick
TF2.0 cherry-pick request: Don't copy more variant elements...
2019-08-07 09:09:20 -07:00
Goldie Gadde
1e3af1b4c9
Merge pull request #31383 from tensorflow/ggadde-version-update
Update TF version 2.0.0-rc0 and update estimator and tensorboard nightly versions.
2019-08-06 18:13:27 -07:00
Goldie Gadde
2ab7a3374a
Merge pull request #31385 from penpornk/cherrypicks_XYAFB
r2.0-rc0 cherry-pick request: Enabling eager rewriting for MKL matmul
2019-08-06 18:08:25 -07:00
Anna R
0a345bf124 Remove change added by mistake in previous commit: max bazel version shouldn't be changed 2019-08-06 16:33:11 -07:00
Anna R
54734900bd Build TensorFlow 2.0 by default. 2019-08-06 16:30:41 -07:00
AG Ramesh
b505fde34f Enabling eager rewriting for MKL matmul 2019-08-06 16:09:41 -07:00
Goldie Gadde
fbc17c1c1c Update TF version to 2.0.0-rc0 and estimator/tb nightly version pins. 2019-08-06 15:12:15 -07:00
Mihai Maruseac
100f54cd9a Don't copy more variant elements than allowed by tensor shape
PiperOrigin-RevId: 261962798
2019-08-06 15:09:07 -07:00
Yifei Feng
135639d075
Merge pull request #31379 from tensorflow/mm-2.0
Update r2.0 branch to prepare for TF2.0.0 release
2019-08-06 14:38:47 -07:00
Goldie Gadde
6e5da334f1 Removing merge conflict marker. 2019-08-06 13:51:02 -07:00
Goldie Gadde
118ac87006 Update the release notes to be in sync with master. 2019-08-06 13:48:29 -07:00
Goldie Gadde
c650305e6c Update estimator nightly version to pick the checkpoint converter tool
changes.
2019-08-06 13:21:51 -07:00
Goldie Gadde
efe4ebd038 Update release notes. 2019-08-06 13:16:40 -07:00
Goldie Gadde
9c9a6658a5 Release Notes for 2.0 Beta. Address the comments.
Add release notes for the cherrypicks.
Add a few more lines to the other fixes.
2019-08-06 13:16:13 -07:00
Goldie Gadde
8e2740f26f Update TF 2.0.0-alpha-0 release notes. (#26369)
Release Notes for 2.0.0-alpha0
2019-08-06 13:13:00 -07:00
Goldie Gadde
d36dc6ca87 Update version to 2.0.0-beta1. 2019-08-06 13:09:52 -07:00
Yifei Feng
d1c32c20aa Update the version to 2.0.0 in tensorflow.bzl
Fix https://github.com/tensorflow/tensorflow/issues/29540
2019-08-06 13:09:26 -07:00
Goldie Gadde
46b5bd8cfe Version update. 2019-08-06 13:07:15 -07:00
Goldie Gadde
5859e87317 Update the version of TF to 2.0.0-alpha0 2019-08-06 13:00:00 -07:00
Goldie Gadde
2d561c3431 Merge branch 'master-where-we-want-it' into r2.0 2019-08-06 12:40:51 -07:00
Goldie Gadde
8e423e3d56 Update release notes. 2019-06-13 10:41:13 -07:00
Goldie Gadde
1d91213fe7
Merge pull request #29741 from goldiegadde/ggadde-version2
Update version to 2.0.0-beta1.
2019-06-13 10:25:23 -07:00
Goldie Gadde
b43f012dec Update version to 2.0.0-beta1. 2019-06-13 04:49:26 -07:00
Goldie Gadde
0c59cc94fd
Merge pull request #29722 from k-w-w/cherrypicks_WPCYV
Cherrypicks for Keras SavedModel
2019-06-12 19:44:07 -07:00
Katherine Wu
974ff69e6e Automated rollback of commit 65b507e8a1
PiperOrigin-RevId: 252924374
2019-06-12 17:19:40 -07:00
Katherine Wu
319e32730a Allow SavedModel serialization to accept None InputSpec values.
PiperOrigin-RevId: 252916721
2019-06-12 17:19:40 -07:00
Goldie Gadde
d08e899087
Merge pull request #29670 from goldiegadde/ggadde-cp5
Cherrypick important fixes to r2.0 branch.
2019-06-12 16:37:56 -07:00
Goldie Gadde
5c07d01e92
Merge pull request #29547 from tensorflow/yifeif-patch-1
Update the version to 2.0.0 in tensorflow.bzl
2019-06-12 16:18:06 -07:00
Edward Loper
a93f2d0465 Add TypeSpec subclasses
PiperOrigin-RevId: 251330524
2019-06-12 14:04:38 -07:00
Goldie Gadde
1640a1f717 Revert "Replace training tensor argument with python boolean. Required for TFLite, which does not yet support control flow ops."
This reverts commit 43e36e609c.
2019-06-12 13:30:02 -07:00
Gunhan Gulsoy
48e26561ec Address review comments. 2019-06-11 15:36:31 -07:00
Goldie Gadde
976d0b63c7 Release Notes for 2.0 Beta. Address the comments.
Add release notes for the cherrypicks.
Add a few more lines to the other fixes.
2019-06-11 15:36:31 -07:00
Katherine Wu
43e36e609c Replace training tensor argument with python boolean. Required for TFLite, which does not yet support control flow ops.
This also adds a class that ensures that all layer call functions are traced with the same inputs, and with training set to both True&False.

PiperOrigin-RevId: 252682485
2019-06-11 15:15:44 -07:00
Nupur Garg
dc3be2fcec Internal change.
PiperOrigin-RevId: 252080249
2019-06-11 15:15:32 -07:00
Scott Zhu
f10351e8b4 Fix the RNN backend swapping issue, and adding new unit test for model.eval.
PiperOrigin-RevId: 252492125
2019-06-11 15:08:45 -07:00
Scott Zhu
c6b4bd510f Partially fix the function inlining and performance regression for LSTM/GRU.
1. Force the defun graph to not inline, so that grappler can properly
do the rewrite. This will fix the codelab performance issue, eg #29506 and #29549.

2. Disable the code path for CuDNN backend with masked input in LSTM/GRU.
Due to the issue of graph rewrite in Grappler for the generated under
tf.cond, this change will fallback to use normal kernel in v2 func graph
model when mask is present. It will be a bit slower compared to the CuDNN kernel,
but will give the correct numerical result. This issue will be addressed
in future change before 2.0 formal release.

PiperOrigin-RevId: 252428433
2019-06-11 15:08:23 -07:00
Yifei Feng
00f6608a0a
Update the version to 2.0.0 in tensorflow.bzl
Fix https://github.com/tensorflow/tensorflow/issues/29540
2019-06-07 13:50:56 -07:00
Goldie Gadde
f59745a381
Merge pull request #29521 from goldiegadde/ggadde-cp2
Revert "Fix an important performance regression for LSTM and GRU in t…
2019-06-07 00:00:18 -07:00
Goldie Gadde
e3bd1efbba Revert "Fix an important performance regression for LSTM and GRU in tf 2.0"
This reverts commit 4f39bd9ce8.
2019-06-06 22:46:38 -07:00
Goldie Gadde
5a25489b70
Merge pull request #29517 from goldiegadde/ggadde-cp1
Fix an important performance regression for LSTM and GRU in tf 2.0
2019-06-06 17:03:22 -07:00
Scott Zhu
4f39bd9ce8 Fix an important performance regression for LSTM and GRU in tf 2.0
The issue was caused by auto-inlining the tf.function in eager context,
which prevented grappler from performing the swap optimization.

PiperOrigin-RevId: 251945251
2019-06-06 16:39:06 -07:00
Goldie Gadde
93eee337a1
Merge pull request #29499 from goldiegadde/ggadde-3
[XLA] Seed each convolution with the same rng state, so that the conv…
2019-06-06 09:30:25 -07:00
Tim Shen
18ea3d398a [XLA] Seed each convolution with the same rng state, so that the conv autotuning input is consistent even when run individually.
PiperOrigin-RevId: 251332367
2019-06-06 08:59:27 -07:00
Goldie Gadde
aea1a7549c
Merge pull request #29473 from goldiegadde/ggadde-cherrypick4
Fix the missing numpy import
2019-06-05 20:33:22 -07:00
Goldie Gadde
e7433829c4 Fix the numpy import. 2019-06-05 20:20:09 -07:00
Katherine Wu
9f7f717179 Set SavedModel as default format for model.save in TF2, and compile loaded model.
Additional change:
- Sequential models are now revived as Sequential.

PiperOrigin-RevId: 251723025
2019-06-05 16:33:56 -07:00
Pavithra Vijay
b9c7a8c6e9 Deserializing loss class in hdf5 format.
PiperOrigin-RevId: 251579891
2019-06-05 16:33:56 -07:00
Shanqing Cai
141727be8c [tf.keras] Fix a breakage in which string attrs in HDF5 file get quotes
- Details of the breakage:
  - The "keras_version" attr of a saved HDF5 (.h5) file of tf.keras model
    started to have quotes around it today. For instance, it ought to be
    2.2.4-tf, but instead becomes "2.2.4-tf" (with the quotes)
  - The root cause CL appears to be CL/251386039
  - This was discovered during TensorFlow.js nightly benchmark
- This CL fixes the breakage and adds a unit test to prevent regression.

PiperOrigin-RevId: 251544402
2019-06-05 16:33:56 -07:00
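One plausible way the quoting breakage described in the commit above arises is double-encoding: serializing a string attribute with `json.dumps` (or `repr`) before writing it, which bakes literal quotes into the stored value. A minimal reproduction in plain Python (illustrative only; this is not the actual Keras/HDF5 saving code):

```python
import json

keras_version = "2.2.4-tf"
# Buggy path: JSON-encoding a bare string adds literal quotes around it,
# so the stored attr reads "2.2.4-tf" instead of 2.2.4-tf.
buggy_attr = json.dumps(keras_version)
# Fixed path: store the raw string unchanged.
fixed_attr = keras_version
print(buggy_attr, fixed_attr)
```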
Anna R
dc7514029a Use self.get_temp_dir() instead of self.create_tempdir to fix Windows build.
PiperOrigin-RevId: 251374355
2019-06-05 16:33:56 -07:00
Francois Chollet
0030e1dbdd Add integration test for Sequential pop workflow.
PiperOrigin-RevId: 251323411
2019-06-05 16:33:56 -07:00
Goldie Gadde
d69b8192fb
Merge pull request #29456 from jaingaurav/cherry-2.0
Marks Keras set_session as compat.v1 only. Also moves some renames to…
2019-06-05 15:00:13 -07:00
Goldie Gadde
79d7110436
Merge pull request #29455 from jaingaurav/cherrypicks-2.0
Make default Keras ConfigProto use tf.config
2019-06-05 14:49:41 -07:00
A. Unique TensorFlower
64bfb8cc30 Marks Keras set_session as compat.v1 only. Also moves some renames to the manual renames that had been incorrectly placed in the auto-generated symbol mappings.
PiperOrigin-RevId: 251708447
2019-06-05 14:36:33 -07:00
Gaurav Jain
9ed3e9423a Make default Keras ConfigProto use tf.config
PiperOrigin-RevId: 251659257
2019-06-05 12:09:21 -07:00
Goldie Gadde
fba5ab6ad0 Update estimator nightly version to pick the checkpoint converter tool
changes.
2019-06-05 10:52:37 -07:00
Yifei Feng
eed4d9c4c4
Merge pull request #29415 from goldiegadde/ggadde-2
Cleaning up this file, so that only Kathy's changes are in.
2019-06-04 22:55:54 +02:00
Goldie Gadde
9ec031ca18 The cherrypick of this file had some issues, as some changes were rolled
back. Cleaning up this file so that only Kathy's changes are in.
2019-06-04 13:42:16 -07:00
A. Unique TensorFlower
9a1e7c8a50 Disable tests that are failing in TFv2 Windows CPU/GPU builds.
PiperOrigin-RevId: 251441123
2019-06-04 10:06:15 -07:00
Goldie Gadde
2becc5bc82 Version update. 2019-06-04 09:28:31 -07:00
Goldie Gadde
8bd937b371 fix the merge conflicts. 2019-06-04 09:28:02 -07:00
A. Unique TensorFlower
71a03a68e7 Disable tests that are failing in TFv2 Windows CPU/GPU builds.
PiperOrigin-RevId: 251343281
2019-06-04 09:28:02 -07:00
Gaurav Jain
742eb3f22a Add get_ prefix to parallelism_threads APIs
PiperOrigin-RevId: 251391606
2019-06-04 09:28:02 -07:00
Katherine Wu
e425b84524 Keras models and layers saving and reviving code. Implements go/tf-model-serialization.
To save and revive a model:
1. Save the model using tf.saved_model.save
2. call load_from_save_model_v2

This restores various metadata about Keras models and layers, as well as their call and loss functions.

Changes to object serialization:
- Adds private fields for tracking object's identifier and metadata.
- Added _list_extra_dependencies_for_serialization, which allows objects to save extra
  dependencies when serialized to SavedModel.
- Object graph view maintains a serialization cache object that is passed to each object when serializing functions/extra dependencies.

PiperOrigin-RevId: 251386039
2019-06-04 09:28:02 -07:00
Gunhan Gulsoy
3149d0377e
Merge pull request #29370 from goldiegadde/r2.0
R2.0 fastforward branch.
2019-06-03 14:55:27 -07:00
Goldie Gadde
bc41b48267 Merge branch 'r2.0' of github.com:tensorflow/tensorflow into r2.0 2019-06-03 14:39:18 -07:00
Yifei Feng
2c2d508aa2
Merge pull request #27515 from tensorflow/cherrypick-87ea41d023a985c5ecee0370c7f7381c8a8cee52
Fix configure.py to properly compare "X.Y.Z" with "X.Y"
2019-04-05 10:23:52 -07:00
Austin Anderson
3b09dccd1c Fix configure.py to properly compare "X.Y.Z" with "X.Y"
Right now, "0.24" is treated as lower than "*.*.*" because of the odd comparison method that adds digits to each existing section. This change converts "0.24" to "0.24.0" to fix that.

This will probably need to be pulled into r2.0.

PiperOrigin-RevId: 241950768
2019-04-04 12:44:52 -07:00
Austin Anderson
966cd0db3a Fix configure.py to properly compare "X.Y.Z" with "X.Y"
Right now, "0.24" is treated as lower than "*.*.*" because of the odd comparison method that adds digits to each existing section. This change converts "0.24" to "0.24.0" to fix that.

This will probably need to be pulled into r2.0.

PiperOrigin-RevId: 241950768
2019-04-04 11:16:58 -07:00
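The version-comparison fix described in the commit above can be sketched in plain Python. This is a hedged illustration only; the function names (`pad_version`, `at_least`) are hypothetical and do not match configure.py's actual code.

```python
# Sketch of the fix: zero-pad "X.Y" to "X.Y.Z" before comparing, so that
# "0.24" is no longer treated as lower than a three-section version.
def pad_version(version, sections=3):
    """Split a dotted version string and zero-pad it to `sections` parts."""
    parts = version.split(".")
    parts += ["0"] * (sections - len(parts))
    return tuple(int(p) for p in parts)

def at_least(installed, minimum):
    """Return True if `installed` >= `minimum` after padding both sides."""
    return pad_version(installed) >= pad_version(minimum)

# "0.24" becomes (0, 24, 0) and compares correctly against (0, 19, 0).
print(at_least("0.24", "0.19.0"))
```

Comparing zero-padded integer tuples also avoids the pitfalls of comparing raw strings ("0.9" sorts above "0.24" lexicographically).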
Yifei Feng
17548db070
Merge pull request #27399 from tensorflow/angersson-r2.0-bazel-version
Pull support for latest Bazel version
2019-04-02 18:12:19 -07:00
Austin Anderson
516939197b
Pull support for latest Bazel version
This makes the 2.0 branch match the current maximum Bazel version settings on `master`. It should help resolve https://github.com/tensorflow/tensorflow/issues/26553, where the latest `devel` images don't support `r2.0`.
2019-04-01 17:33:46 -07:00
Yifei Feng
2c319fb415
Merge pull request #26373 from goldiegadde/cherrypicks_MQMPF
Disable bincount_op_test on windows gpu.
2019-03-05 13:41:53 -08:00
A. Unique TensorFlower
421340c397 Disable bincount_op_test on windows gpu.
PiperOrigin-RevId: 236907827
2019-03-05 13:39:05 -08:00
Goldie Gadde
74a8ab67fa Update TF 2.0.0-alpha-0 release notes. (#26369)
Release Notes for 2.0.0-alpha0
2019-03-05 13:24:28 -08:00
Gunhan Gulsoy
685ade7422
Merge pull request #26370 from tensorflow/yifeif-patch-2
Switch to use pip3
2019-03-05 11:33:58 -08:00
Yifei Feng
68afc63e93
Switch to use pip3 2019-03-05 11:26:40 -08:00
Gunhan Gulsoy
86af76e6f6
Merge pull request #26367 from goldiegadde/cherrypicks_AEHRG
Cherrypick build and test fixes for fixing the release builds for TF 2.0 alpha-0
2019-03-05 10:00:28 -08:00
A. Unique TensorFlower
1f73987d59 disable failing virtual_gpu_test on gpu.
PiperOrigin-RevId: 236780392
2019-03-05 09:36:51 -08:00
A. Unique TensorFlower
fc5e2edd0d Add ops_history.v2.pbtxt to the build dependency of
backwards_compatibility_test.

PiperOrigin-RevId: 236782576
2019-03-05 09:36:44 -08:00
A. Unique TensorFlower
92c7f48de1 Add ops_history.v2.pbtxt for TF 2.0 alpha release.
PiperOrigin-RevId: 236732665
2019-03-05 09:36:39 -08:00
Mihai Maruseac
0bece764ea Disable some v2 py36 pip flaky tests
PiperOrigin-RevId: 236747310
2019-03-05 09:36:32 -08:00
Martin Wicke
e907242ae3 Use eager-friendly evaluation for bincount_op_test.
PiperOrigin-RevId: 236690668
2019-03-05 09:36:26 -08:00
A. Unique TensorFlower
c6850fa156 Internal Change
PiperOrigin-RevId: 236754902
2019-03-05 09:36:19 -08:00
A. Unique TensorFlower
0d18c7cbad Internal change
PiperOrigin-RevId: 236729117
2019-03-05 09:36:11 -08:00
A. Unique TensorFlower
f147da65a6 Work around crash in nvcc by using the same #include files and order as max_pooling, which seems to not trigger the crash.
PiperOrigin-RevId: 236832623
2019-03-05 09:36:04 -08:00
Yifei Feng
c808841ce7 Install auditwheel 1.5.0.
PiperOrigin-RevId: 236681136
2019-03-05 09:35:56 -08:00
Gunhan Gulsoy
45534731dd
Merge pull request #26272 from tensorflow/version-update-ggadde
Update the version of TF to 2.0.0-alpha0
2019-03-04 11:16:57 -08:00
Gunhan Gulsoy
bdecee4c43
Merge pull request #26326 from tensorflow/2.0-ff
2.0 ff, Move r2.0 branch ahead to pick up test and build fixes.
2019-03-04 10:26:28 -08:00
Goldie Gadde
d1d77dd8c8 Update the version of TF to 2.0.0-alpha0 2019-03-01 16:57:47 -08:00
552 changed files with 18028 additions and 6369 deletions

.bazelrc

@ -1,3 +1,80 @@
# TensorFlow Bazel configuration file.
# This file tries to group and simplify build options for TensorFlow
#
# ----CONFIG OPTIONS----
# Android options:
# android:
# android_arm:
# android_x86:
# android_x86_64:
#
# iOS options:
# ios:
# ios_armv7:
# ios_arm64:
# ios_x86_64:
# ios_fat:
#
# Compiler options:
# cuda_clang: Use clang when building CUDA code.
# c++17: Build with C++17 options
# C++1z: Build with C++17 options
# avx_linux: Build with avx instruction set on linux.
# avx2_linux: Build with avx2 instruction set on linux.
# arch_native_linux: Build with instruction sets available to the host machine on linux
# avx_win: Build with avx instruction set on windows
# avx2_win: Build with avx2 instruction set on windows
#
# Other build options:
# short_logs: Only log errors during build, skip warnings.
# monolithic: Build all TF C++ code into a single shared object.
# dynamic_kernels: Try to link all kernels dynamically (experimental).
#
#
# TF version options;
# v1: Build TF V1 (without contrib)
# v2: Build TF v2
#
# Feature and Third party library support options:
# xla: Build TF with XLA
# using_cuda: CUDA is available to build system.
# cuda: Build with full cuda support.
# rocm: Build with AMD GPU support (rocm).
# sycl: Build with SYCL support.
# sycl_nodouble:
# sycl_asan:
# sycl_trisycl:
# mkl: Enable full mkl support.
# mkl_open_source_only: Enable MKL support only using open source MKL libraries.
# tensorrt: Enable Tensorrt support.
# ngraph: Enable ngraph support.
# numa: Enable numa using hwloc.
# noaws: Disable AWS S3 storage support
# nogcp: Disable GCS support.
# nohdfs: Disable hadoop hdfs support.
# nonccl: Disable nccl support.
#
#
# Remote build execution options (only configured to work with TF team projects for now.)
# rbe: General RBE options shared by all flavors.
# rbe_linux: General RBE options used on all linux builds.
# rbe_win: General RBE options used on all windows builds.
#
# rbe_cpu_linux: RBE options to build with only CPU support.
# rbe_linux_cuda_nvcc: RBE options to build with GPU support using nvcc.
# rbe_gpu_linux: An alias for rbe_linux_cuda_nvcc
#
# rbe_linux_py2: Linux Python 2 RBE config.
# rbe_linux_py3: Linux Python 3 RBE config
#
# rbe_win_py37: Windows Python 3.7 RBE config
#
# tensorflow_testing_rbe_linux: RBE options to use RBE with tensorflow-testing project on linux
# tensorflow_testing_rbe_win: RBE options to use RBE with tensorflow-testing project on windows
#
# Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
# target CPU to build transient dependencies correctly. See
# https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
@ -48,15 +125,6 @@ build:mkl_open_source_only --define=build_with_mkl_dnn_v1_only=true
build:mkl_open_source_only --define=build_with_mkl=true --define=enable_mkl=true
build:mkl_open_source_only --define=tensorflow_mkldnn_contraction_kernel=0
build:download_clang --crosstool_top=@local_config_download_clang//:toolchain
build:download_clang --define=using_clang=true
build:download_clang --action_env TF_DOWNLOAD_CLANG=1
# Instruct clang to use LLD for linking.
# This only works with GPU builds currently, since Bazel sets -B/usr/bin in
# auto-generated CPU crosstool, forcing /usr/bin/ld.lld to be preferred over
# the downloaded one.
build:download_clang_use_lld --linkopt='-fuse-ld=lld'
# This config refers to building with CUDA available. It does not necessarily
# mean that we build CUDA op kernels.
build:using_cuda --define=using_cuda=true
@ -109,7 +177,6 @@ build --define=use_fast_cpp_protos=true
build --define=allow_oversize_protos=true
build --spawn_strategy=standalone
build --strategy=Genrule=standalone
build -c opt
# Make Bazel print out all options from rc files.
@ -132,29 +199,147 @@ build --define=PREFIX=/usr
build --define=LIBDIR=$(PREFIX)/lib
build --define=INCLUDEDIR=$(PREFIX)/include
# Suppress C++ compiler warnings, otherwise build logs become 10s of MBs.
build --copt=-w
# Suppress all warning messages.
build:short_logs --output_filter=DONT_MATCH_ANYTHING
# Instruction set optimizations
# TODO(gunan): Create a feature in toolchains for avx/avx2 to
# avoid having to define linux/win separately.
build:avx_linux --copt=-mavx
build:avx2_linux --copt=-mavx2
build:native_arch_linux --copt=-march=native
build:avx_win --copt=/arch=AVX
build:avx2_win --copt=/arch=AVX2
# Options to build TensorFlow 1.x or 2.x.
build:v1 --define=tf_api_version=1
build:v2 --define=tf_api_version=2
build:v1 --action_env=TF2_BEHAVIOR=0
build:v2 --action_env=TF2_BEHAVIOR=1
build --config=v2
test --config=v2
# Enable XLA
build:xla --action_env=TF_ENABLE_XLA=1
build:xla --define=with_xla_support=true
# BEGIN TF REMOTE BUILD EXECUTION OPTIONS
# Options when using remote execution
# WARNING: THESE OPTIONS WONT WORK IF YOU DO NOT HAVE PROPER AUTHENTICATION AND PERMISSIONS
build:rbe --action_env=BAZEL_DO_NOT_DETECT_CPP_TOOLCHAIN=1
build:rbe --auth_enabled=true
build:rbe --auth_scope=https://www.googleapis.com/auth/cloud-source-tools
build:rbe --bes_backend=buildeventservice.googleapis.com
build:rbe --bes_best_effort=false
build:rbe --bes_results_url="https://source.cloud.google.com/results/invocations"
build:rbe --bes_timeout=600s
build:rbe --define=EXECUTOR=remote
build:rbe --flaky_test_attempts=3
build:rbe --jobs=200
build:rbe --remote_accept_cached=true
build:rbe --remote_cache=remotebuildexecution.googleapis.com
build:rbe --remote_executor=remotebuildexecution.googleapis.com
build:rbe --remote_local_fallback=false
build:rbe --remote_timeout=600
build:rbe --remote_executor=grpcs://remotebuildexecution.googleapis.com
build:rbe --remote_timeout=3600
build:rbe --spawn_strategy=remote
build:rbe --strategy=Genrule=remote
build:rbe --strategy=Closure=remote
build:rbe --strategy=Javac=remote
build:rbe --strategy=TestRunner=remote
build:rbe --tls_enabled
test:rbe --test_env=USER=anon
build:rbe --distinct_host_configuration=false
build:rbe_linux --config=rbe
build:rbe_linux --action_env=PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/go/bin"
build:rbe_linux --host_javabase=@bazel_toolchains//configs/ubuntu16_04_clang/1.1:jdk8
build:rbe_linux --javabase=@bazel_toolchains//configs/ubuntu16_04_clang/1.1:jdk8
build:rbe_linux --host_java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8
build:rbe_linux --java_toolchain=@bazel_tools//tools/jdk:toolchain_hostjdk8
# Non-rbe settings we should include because we do not run configure
build:rbe_linux --config=xla
build:rbe_linux --config=avx_linux
build:rbe_linux --config=short_logs
# TODO(gunan): Check why we need this specified in rbe, but not in other builds.
build:rbe_linux --linkopt=-lrt
build:rbe_cpu_linux --config=rbe_linux
build:rbe_cpu_linux --crosstool_top="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:toolchain"
build:rbe_cpu_linux --extra_toolchains="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010:cc-toolchain-k8"
build:rbe_cpu_linux --extra_execution_platforms="@org_tensorflow//third_party/toolchains:rbe_ubuntu16.04-manylinux2010"
build:rbe_cpu_linux --host_platform="@org_tensorflow//third_party/toolchains:rbe_ubuntu16.04-manylinux2010"
build:rbe_cpu_linux --platforms="@org_tensorflow//third_party/toolchains:rbe_ubuntu16.04-manylinux2010"
build:rbe_linux_cuda_nvcc --config=rbe_linux
build:rbe_linux_cuda_nvcc --crosstool_top="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain"
build:rbe_linux_cuda_nvcc --extra_toolchains="//third_party/toolchains/preconfig/ubuntu16.04/gcc7_manylinux2010-nvcc-cuda10.1:toolchain-linux-x86_64"
build:rbe_linux_cuda_nvcc --extra_execution_platforms="@org_tensorflow//third_party/toolchains:rbe_cuda10.1-cudnn7-ubuntu16.04-manylinux2010,@org_tensorflow//third_party/toolchains:rbe_cuda10.1-cudnn7-ubuntu16.04-manylinux2010-gpu"
build:rbe_linux_cuda_nvcc --host_platform="@org_tensorflow//third_party/toolchains:rbe_cuda10.1-cudnn7-ubuntu16.04-manylinux2010"
build:rbe_linux_cuda_nvcc --platforms="@org_tensorflow//third_party/toolchains:rbe_cuda10.1-cudnn7-ubuntu16.04-manylinux2010"
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_CONFIG_REPO="@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/cuda10.1-cudnn7"
build:rbe_linux_cuda_nvcc --repo_env=TF_TENSORRT_CONFIG_REPO="@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/tensorrt6.0"
build:rbe_linux_cuda_nvcc --repo_env=TF_NEED_TENSORRT=1
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDA_VERSION=10
build:rbe_linux_cuda_nvcc --repo_env=TF_CUDNN_VERSION=7
build:rbe_linux_cuda_nvcc --repo_env=REMOTE_GPU_TESTING=1
build:rbe_linux_cuda_nvcc --repo_env=TF_NEED_CUDA=1
build:rbe_linux_cuda_nvcc --define=using_cuda_nvcc=true
test:rbe_linux_cuda_nvcc --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64"
common:rbe_gpu_linux --config=rbe_linux_cuda_nvcc
build:rbe_linux_py2 --config=rbe_linux
build:rbe_linux_py2 --repo_env=PYTHON_BIN_PATH="/usr/bin/python2"
build:rbe_linux_py2 --python_path="/usr/bin/python2"
build:rbe_linux_py2 --repo_env=TF_PYTHON_CONFIG_REPO="@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py"
build:rbe_linux_py3 --config=rbe_linux
build:rbe_linux_py3 --repo_env=PYTHON_BIN_PATH="/usr/bin/python3"
build:rbe_linux_py3 --python_path="/usr/bin/python3"
build:rbe_linux_py3 --repo_env=TF_PYTHON_CONFIG_REPO="@org_tensorflow//third_party/toolchains/preconfig/ubuntu16.04/py3"
build:rbe_win --config=rbe
build:rbe_win --crosstool_top="@org_tensorflow//third_party/toolchains/preconfig/win_1803/bazel_026:toolchain"
build:rbe_win --extra_execution_platforms="@org_tensorflow//third_party/toolchains/preconfig/win_1803:rbe_windows_1803"
build:rbe_win --extra_toolchains="@org_tensorflow//third_party/toolchains/preconfig/win_1803/bazel_026:cc-toolchain-x64_windows"
build:rbe_win --host_javabase="@org_tensorflow//third_party/toolchains/preconfig/win_1803:windows_jdk8"
build:rbe_win --host_platform="@org_tensorflow//third_party/toolchains/preconfig/win_1803:rbe_windows_1803"
build:rbe_win --javabase="@org_tensorflow//third_party/toolchains/preconfig/win_1803:windows_jdk8"
build:rbe_win --platforms="@org_tensorflow//third_party/toolchains/preconfig/win_1803:rbe_windows_1803"
build:rbe_win --shell_executable=C:\\tools\\msys64\\usr\\bin\\bash.exe
# Misc build options we need for windows
build:rbe_win --copt=-DWIN32_LEAN_AND_MEAN
build:rbe_win --host_copt=-DWIN32_LEAN_AND_MEAN
build:rbe_win --copt=-DNOGDI
build:rbe_win --host_copt=-DNOGDI
build:rbe_win --linkopt=/DEBUG
build:rbe_win --host_linkopt=/DEBUG
build:rbe_win --linkopt=/OPT:REF
build:rbe_win --host_linkopt=/OPT:REF
build:rbe_win --linkopt=/OPT:ICF
build:rbe_win --host_linkopt=/OPT:ICF
build:rbe_win --config=monolithic
build:rbe_win --experimental_strict_action_env=true
build:rbe_win --incompatible_windows_native_test_wrapper
# TODO(gunan): Remove once we use MSVC 2019 with latest patches.
build:rbe_win --define=override_eigen_strong_inline=true
build:rbe_win_py37 --config=rbe
build:rbe_win_py37 --repo_env=PYTHON_BIN_PATH=C:\\Python37\\python.exe
build:rbe_win_py37 --repo_env=PYTHON_LIB_PATH=C:\\Python37\\lib\\site-packages
build:rbe_win_py37 --repo_env=TF_PYTHON_CONFIG_REPO=@org_tensorflow//third_party/toolchains/preconfig/win_1803/py37
build:rbe_win_py37 --python_path=C:\\Python37\\python.exe
# These you may need to change for your own GCP project.
build:tensorflow_testing_rbe --project_id=tensorflow-testing
common:tensorflow_testing_rbe_linux --remote_instance_name=tensorflow-testing/instances/default_instance
build:tensorflow_testing_rbe_linux --config=tensorflow_testing_rbe
build:tensorflow_testing_rbe_linux --config=rbe
build:tensorflow_testing_rbe_linux --config=rbe_linux
common:tensorflow_testing_rbe_win --remote_instance_name=projects/tensorflow-testing/instances/windows
build:tensorflow_testing_rbe_win --config=tensorflow_testing_rbe
build:tensorflow_testing_rbe_win --config=rbe_win
# END TF REMOTE BUILD EXECUTION OPTIONS
# Default options should come above this line
# Options from ./configure


@ -116,6 +116,8 @@ The TensorFlow project strives to abide by generally accepted best practices in
Build Type | Status | Artifacts
--------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
**Linux AMD ROCm GPU** Nightly | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly) | [Nightly](http://ml-ci.amd.com:21096/job/tensorflow-rocm-nightly/lastSuccessfulBuild/)
**Linux AMD ROCm GPU** Stable Release | [![Build Status](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/badge/icon)](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/) | [Release](http://ml-ci.amd.com:21096/job/tensorflow-rocm-release/lastSuccessfulBuild/)
**Linux s390x** Nightly | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | [Nightly](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)
**Linux s390x CPU** Stable Release | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/badge/icon)](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/) | [Release](https://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_Release_Build/)
**Linux ppc64le CPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Nightly_Artifact/)

File diff suppressed because one or more lines are too long


@ -1559,9 +1559,6 @@ def main():
if is_windows():
set_windows_build_flags(environ_cp)
# Add a config option to build TensorFlow 2.0 API.
write_to_bazelrc('build:v2 --define=tf_api_version=2')
if get_var(environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace', False,
('Would you like to interactively configure ./WORKSPACE for '
'Android builds?'), 'Searching for NDK and SDK installations.',


@ -777,8 +777,8 @@ genrule(
mkdir $@
for f in $(SRCS); do
d="$${f%/*}"
d="$${d#bazel-out*genfiles/}"
d="$${d#*external/eigen_archive/}"
d="$${d#bazel-out/*/genfiles/}"
d="$${d#bazel-out/*/bin/}"
if [[ $${d} == *local_config_* ]]; then
continue
@ -790,6 +790,9 @@ genrule(
if [[ $${TF_SYSTEM_LIBS:-} == *$${extname}* ]]; then
continue
fi
d="$${d#*external/farmhash_archive/src}"
d="$${d#*external/$${extname}/}"
fi
mkdir -p "$@/$${d}"
@ -808,8 +811,8 @@ genrule(
}),
outs = ["__init__.py"],
cmd = select({
"api_version_2": "cp $(@D)/_api/v2/v2.py $(OUTS)",
"//conditions:default": "cp $(@D)/_api/v1/v1.py $(OUTS)",
"api_version_2": "cp $(@D)/_api/v2/v2.py $(OUTS) && sed -i'.original' 's:from . import:from . _api.v2 import:g' $(OUTS)",
"//conditions:default": "cp $(@D)/_api/v1/v1.py $(OUTS) && sed -i'.original' 's:from . import:from ._api.v1 import:g' $(OUTS)",
}),
)
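The sed rewrite in the genrule above turns the generated top-level `__init__.py` imports into relative `_api` imports. Its effect can be mimicked in Python with `re.sub` (an illustration only; the build itself runs sed in a shell):

```python
import re

line = "from . import estimator"
# Equivalent of: sed 's:from . import:from ._api.v1 import:g'
# (sed's unescaped "." matches any character; here we escape it.)
rewritten = re.sub(r"from \. import", "from ._api.v1 import", line)
print(rewritten)  # from ._api.v1 import estimator
```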


@ -56,10 +56,10 @@ elif _tf_api_dir not in __path__:
__path__.append(_tf_api_dir)
# Hook external TensorFlow modules.
# Import compat before trying to import summary from tensorboard, so that
# reexport_tf_summary can get compat from sys.modules
_current_module.compat.v2.compat.v1 = _current_module.compat.v1
# reexport_tf_summary can get compat from sys.modules. Only needed if using
# lazy loading.
_current_module.compat.v2 # pylint: disable=pointless-statement
try:
from tensorboard.summary._tf import summary
_current_module.__path__ = (
@@ -78,7 +78,7 @@ except ImportError:
pass
try:
from tensorflow.python.keras.api._v2 import keras
from .python.keras.api._v2 import keras
_current_module.__path__ = (
[_module_util.get_parent_dir(keras)] + _current_module.__path__)
setattr(_current_module, "keras", keras)
@@ -125,25 +125,6 @@ if _running_from_pip_package():
if _fi.file_exists(plugin_dir):
_ll.load_library(plugin_dir)
# These symbols appear because we import the python package which
# in turn imports from tensorflow.core and tensorflow.python. They
# must come from this module. So python adds these symbols for the
# resolution to succeed.
# pylint: disable=undefined-variable
try:
del python
except NameError:
pass
try:
del core
except NameError:
pass
try:
del compiler
except NameError:
pass
# pylint: enable=undefined-variable
# Add module aliases
if hasattr(_current_module, 'keras'):
losses = keras.losses


@@ -60,6 +60,10 @@ elif _tf_api_dir not in __path__:
__path__.append(_tf_api_dir)
# Hook external TensorFlow modules.
# Import compat before trying to import summary from tensorboard, so that
# reexport_tf_summary can get compat from sys.modules. Only needed if using
# lazy loading.
_current_module.compat.v2 # pylint: disable=pointless-statement
try:
from tensorflow_estimator.python.estimator.api._v1 import estimator
_current_module.__path__ = (
@@ -69,7 +73,7 @@ except ImportError:
pass
try:
from tensorflow.python.keras.api._v1 import keras
from .python.keras.api._v1 import keras
_current_module.__path__ = (
[_module_util.get_parent_dir(keras)] + _current_module.__path__)
setattr(_current_module, "keras", keras)
@@ -134,6 +138,10 @@ if _running_from_pip_package():
if _fi.file_exists(plugin_dir):
_ll.load_library(plugin_dir)
# Disable TF2 behavior
from tensorflow.python.compat import v2_compat as _compat # pylint: disable=g-import-not-at-top
_compat.disable_v2_behavior()
# These symbols appear because we import the python package which
# in turn imports from tensorflow.core and tensorflow.python. They
# must come from this module. So python adds these symbols for the
@@ -152,5 +160,4 @@ try:
except NameError:
pass
_current_module.compat.v2.compat.v1 = _current_module.compat.v1
# pylint: enable=undefined-variable


@@ -671,7 +671,7 @@ void TFE_OpAddInputList(TFE_Op* op, TFE_TensorHandle** inputs, int num_inputs,
TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
unsigned char* is_list, TF_Status* status) {
TF_AttrType ret;
TF_AttrType ret = TF_ATTR_INT;
status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(),
attr_name, &ret, is_list);
return ret;


@@ -83,7 +83,10 @@ void ExecuteWithProfiling(bool async) {
if (!gpu_device_name.empty()) {
EXPECT_TRUE(HasSubstr(profile_proto_str, "/device:GPU:0"));
// device name with "stream:all" is collected by Device Tracer.
#ifndef TENSORFLOW_USE_ROCM
// ROCm platform does not yet support stream level tracing
EXPECT_TRUE(HasSubstr(profile_proto_str, "stream:all"));
#endif
}
// "/host:CPU" is collected by TraceMe
EXPECT_TRUE(HasSubstr(profile_proto_str, "/host:CPU"));


@@ -63,12 +63,26 @@ cat << EOF > tensorflow.pc
prefix=${TF_PREFIX}
exec_prefix=\${prefix}
libdir=\${exec_prefix}/${LIBDIR}
includedir=\${prefix}/include
includedir=\${prefix}/include/tensorflow
Name: TensorFlow
Version: ${TF_VERSION}
Description: Library for computation using data flow graphs for scalable machine learning
Requires:
Libs: -L\${libdir} -ltensorflow
Libs: -L\${libdir} -ltensorflow -ltensorflow_framework
Cflags: -I\${includedir}
EOF
cat << EOF > tensorflow_cc.pc
prefix=${TF_PREFIX}
exec_prefix=\${prefix}
libdir=\${exec_prefix}/${LIBDIR}
includedir=\${prefix}/include/tensorflow
Name: TensorFlow
Version: ${TF_VERSION}
Description: Library for computation using data flow graphs for scalable machine learning
Requires:
Libs: -L\${libdir} -ltensorflow_cc -ltensorflow_framework
Cflags: -I\${includedir}
EOF


@@ -1,8 +1,8 @@
load(
"//tensorflow:tensorflow.bzl",
"tf_cc_test",
"tf_kernel_library",
"tf_gen_op_libs",
"tf_kernel_library",
)
package(


@@ -293,7 +293,9 @@ string ToCamelCase(const string& str) {
bool cap = true;
while (i < str.size()) {
const char c = str[i++];
if (c == joiner) {
if (c == '>') {
cap = true;
} else if (c == joiner) {
cap = true;
} else if (cap) {
result += toupper(c);
@@ -305,6 +307,21 @@ string ToCamelCase(const string& str) {
return result;
}
string SeparateNamespaces(const string& str) {
string result;
const char joiner = '_';
size_t i = 0;
while (i < str.size()) {
const char c = str[i++];
if (c == '>') {
result += joiner;
} else {
result += c;
}
}
return result;
}
// Returns a <string, bool> pair. The string is the C++ type name to be used for
// attr_type when defining an object of that type. The bool is a flag to
// indicate whether to treat the type as const when accepting the C++ type as an
@@ -550,7 +567,7 @@ struct OpInfo {
OpInfo::OpInfo(const OpDef& graph_op_def, const ApiDef& api_def,
const std::vector<string>& aliases)
: graph_op_def(graph_op_def), api_def(api_def), aliases(aliases) {
op_name = api_def.endpoint(0).name();
op_name = SeparateNamespaces(api_def.endpoint(0).name());
InferOpAttributes(graph_op_def, &inferred_input_attrs);
has_optional_attrs = HasOptionalAttrs(api_def, inferred_input_attrs);
arg_types.push_back("const ::tensorflow::Scope&");


@@ -9,6 +9,7 @@ tf_cuda_cc_test(
name = "profiler_test",
srcs = ["profiler_test.cc"],
tags = [
"no_rocm", # stream level tracing not supported on ROCm
"nogpu", # b/77649654
],
deps = [


@@ -19,6 +19,11 @@ limitations under the License.
#include "tensorflow/cc/saved_model/constants.h"
#include "tensorflow/cc/saved_model/reader.h"
#include "tensorflow/core/framework/attr_value.pb.h"
#include "tensorflow/core/framework/function.pb.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/lib/monitoring/counter.h"
#include "tensorflow/core/lib/monitoring/sampler.h"
@@ -64,12 +69,54 @@ uint64 GetLatencyMicroseconds(const uint64 start_microseconds) {
return end_microseconds - start_microseconds;
}
// Ensure that constant tensors loaded from the saved model have valid shape.
// Also ensure that constant nodes have a value assigned to them.
// TODO(b/154763635): this is temporary and will be replaced with a better audit
static Status ValidateNode(const NodeDef& node) {
const auto node_iterator = node.attr().find("value");
if (node_iterator != node.attr().end()) {
AttrValue node_value = node_iterator->second;
if (node_value.has_tensor()) {
const PartialTensorShape node_shape(node_value.tensor().tensor_shape());
if (node_shape.num_elements() < 0) {
return errors::FailedPrecondition(
"Saved model contains node \"", node.name(), "\" (op \"", node.op(),
"\") which initializes from a tensor with ",
node_shape.num_elements(), " elements");
}
}
} else if (node.op() == "Const") {
return errors::FailedPrecondition(
"Saved model contains node \"", node.name(),
"\" which is a constant tensor but no value has been provided");
}
return Status::OK();
}
static Status ValidateSavedTensors(const GraphDef& graph_def) {
for (const auto& node : graph_def.node()) {
TF_RETURN_IF_ERROR(ValidateNode(node));
}
if (graph_def.has_library()) {
const FunctionDefLibrary& library = graph_def.library();
for (const auto& function : library.function()) {
for (const auto& node : function.node_def()) {
TF_RETURN_IF_ERROR(ValidateNode(node));
}
}
}
return Status::OK();
}
Status LoadMetaGraphIntoSession(const MetaGraphDef& meta_graph_def,
const SessionOptions& session_options,
std::unique_ptr<Session>* session) {
Session* session_p = nullptr;
TF_RETURN_IF_ERROR(NewSession(session_options, &session_p));
session->reset(session_p);
TF_RETURN_IF_ERROR(ValidateSavedTensors(meta_graph_def.graph_def()));
return (*session)->Create(meta_graph_def.graph_def());
}


@@ -1,4 +1,4 @@
load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library")
load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "tf_cc_test")
load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps")
load("//tensorflow/core/platform:default/build_config.bzl", "tf_additional_all_protos", "tf_proto_library")


@@ -693,7 +693,8 @@ class EagerFunctionTest(xla_test.XLATestCase):
return x, y
wholly_compiled_f = def_function.function(f)
op_by_op_f = def_function.function(f, experimental_compile=False)
op_by_op_f = function.defun_with_attributes(
f, attributes={'_XlaCompile': False})
x = constant_op.constant([0.0, 2.0], name='data')


@@ -776,7 +776,9 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor {
nvinfer1::TensorFormats getAllowedFormats() const override { return 1; }
bool isShape() const override { return false; }
bool isShapeTensor() const override { return false; }
bool isExecutionTensor() const override { return true; }
#endif
private:
@@ -5191,7 +5193,11 @@ Status ConvertGraphDefToEngine(
}
// Build the network
VLOG(1) << "Starting engine conversion ";
if (VLOG_IS_ON(1)) {
string mode_str;
TF_RETURN_IF_ERROR(TrtPrecisionModeToName(precision_mode, &mode_str));
VLOG(1) << "Starting engine conversion, precision mode: " << mode_str;
}
Converter converter(trt_network.get(), precision_mode, use_calibration);
std::vector<Converter::EngineOutputInfo> output_tensors;
// Graph nodes are already topologically sorted during construction


@@ -48,12 +48,10 @@ class GetCalibrationDataOp : public OpKernel {
&resource));
core::ScopedUnref sc(resource);
auto* calib_ctx = resource->calib_ctx_.get();
// Serialize the resource as output.
string serialized_resource;
OP_REQUIRES_OK(context, calib_ctx->SerializeToString(&serialized_resource));
resource->calib_ctx_.reset();
string serialized_resource = resource->calib_ctx_->TerminateCalibration();
OP_REQUIRES(context, !serialized_resource.empty(),
errors::Unknown("Calibration table is empty."));
Tensor* output = nullptr;
OP_REQUIRES_OK(context,


@@ -392,9 +392,8 @@ Status TRTEngineOp::VerifyInputShapes(const std::vector<TensorShape>& shapes) {
return Status::OK();
}
Status TRTEngineOp::GetEngineInputShapes(
const CacheType& cache, const std::vector<TensorShape>& actual_input_shapes,
std::vector<TensorShape>* engine_input_shapes) {
bool AreShapesCompatible(const std::vector<TensorShape>& actual_shapes,
const std::vector<TensorShape>& cached_shapes) {
auto match_shape = [](const TensorShape& actual_shape,
const TensorShape& cached_shape) {
// Match the rank.
@@ -407,16 +406,17 @@ Status TRTEngineOp::GetEngineInputShapes(
}
return true;
};
auto match_shapes = [&](const std::vector<TensorShape>& actual_shapes,
const std::vector<TensorShape>& cached_shapes) {
for (int i = 0; i < actual_shapes.size(); ++i) {
if (!match_shape(actual_shapes[i], cached_shapes[i])) {
return false;
}
for (int i = 0; i < actual_shapes.size(); ++i) {
if (!match_shape(actual_shapes[i], cached_shapes[i])) {
return false;
}
return true;
};
}
return true;
}
Status TRTEngineOp::GetEngineInputShapes(
const CacheType& cache, const std::vector<TensorShape>& actual_input_shapes,
std::vector<TensorShape>* engine_input_shapes) {
// VerifyInputShapes() already ensured that all input shapes have same
// batch size, and are not scalars.
*engine_input_shapes = actual_input_shapes;
@@ -430,7 +430,7 @@ Status TRTEngineOp::GetEngineInputShapes(
", cached size: ", cached_input_shapes.size(),
" vs. actual size: ", actual_input_shapes.size());
}
if (match_shapes(actual_input_shapes, cached_input_shapes)) {
if (AreShapesCompatible(actual_input_shapes, cached_input_shapes)) {
const int cached_batch_size = cached_input_shapes[0].dim_size(0);
if (min_matched_batch_size > cached_batch_size) {
min_matched_batch_size = cached_batch_size;
@@ -668,7 +668,8 @@ StatusOr<EngineContext*> TRTEngineOp::GetEngine(
static EngineContext empty_context;
mutex_lock lock(engine_mutex_);
// TODO(tmorris): using first input to get batch size - is this reliable?
// Using first input to get batch size is reliable - VerifyInputShapes() has
// verified that.
const int batch_size = input_shapes[0].dim_size(0);
auto& cache = cache_res->cache_;
auto allocator = cache_res->allocator_.get();
@@ -678,14 +679,9 @@ StatusOr<EngineContext*> TRTEngineOp::GetEngine(
// Handle the static engine case. For static engines, the cache will have a
// single element containing the only engine.
//
// TODO(laigd): This is legacy mode for TF v1.x, need to remove when all known
// users switch to 2.0.
if (static_engine_) {
if (cache.size()) {
// Batch size of engine must be >= the input batch size
// TODO(tmorris): use match compatible function?
if (cache.begin()->first[0].dim_size(0) >= batch_size) {
if (AreShapesCompatible(input_shapes, cache.begin()->first)) {
return cache.begin()->second.get();
}
return &empty_context;
@@ -724,9 +720,7 @@ StatusOr<EngineContext*> TRTEngineOp::GetEngine(
return cache.at(engine_input_shapes).get();
} // static_engine_
// Handle the dynamic engine case.
// See if there is a compatible engine cached. The batch size should be <= the
// cached batch size.
// Handle the dynamic engine case. See if there is a compatible engine cached.
std::vector<TensorShape> engine_input_shapes;
TF_RETURN_IF_ERROR(
GetEngineInputShapes(cache, input_shapes, &engine_input_shapes));
@@ -843,19 +837,18 @@ Status TRTEngineOp::AllocateCalibrationResources(
if (!s.ok()) {
LOG(ERROR) << "Calibration failed: " << s;
cres->calibrator_->setDone(); // Ignore further pushes
} else {
// Transfer the ownership of the engine to the engine cache, so we can
// dump it out during conversion for TF 2.0.
mutex_lock lock(this->engine_mutex_);
this->calibrator_ = std::move(cres->calibrator_);
TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context(
cres->engine_->createExecutionContext());
cache_res->cache_.emplace(
shapes, absl::make_unique<EngineContext>(std::move(cres->engine_),
std::move(exec_context)));
}
// Transfer the ownership of the engine to the engine cache, so we can
// dump it out during conversion for TF 2.0.
mutex_lock lock(this->engine_mutex_);
cres->SetCalibrationTable();
this->calibrator_ = std::move(cres->calibrator_);
TrtUniquePtrType<nvinfer1::IExecutionContext> exec_context(
cres->engine_->createExecutionContext());
cache_res->cache_.emplace(
shapes, absl::make_unique<EngineContext>(std::move(cres->engine_),
std::move(exec_context)));
VLOG(1) << "Calibration loop terminated " << this->name();
}));
VLOG(1) << "initialized calibrator resource";


@@ -184,13 +184,7 @@ class SerializeTRTResource : public OpKernel {
core::ScopedUnref unref_me(resource);
// Terminate the calibration if any.
if (resource->calib_ctx_) {
// We don't save the calibration_table for TF 2.0 at the moment, it's used
// in 1.x environment.
string calibration_table;
OP_REQUIRES_OK(
ctx, resource->calib_ctx_->SerializeToString(&calibration_table));
}
if (resource->calib_ctx_) resource->calib_ctx_->TerminateCalibration();
// Serialize the engines and write them to file.
std::unique_ptr<WritableFile> file;


@@ -30,6 +30,26 @@ limitations under the License.
namespace tensorflow {
namespace tensorrt {
string CalibrationContext::TerminateCalibration() {
mutex_lock l(mu_);
if (terminated_) return calibration_table_;
TRTInt8Calibrator* raw_calibrator = calibrator_.get();
raw_calibrator->waitAndSetDone();
terminated_ = true;
// At this point the calibration thread `thr_` is woken up and can
// transfer the ownership of `calibrator_` and `engine_` at any time, so
// it's not safe to use `calibrator_` below, but we can still access it
// using raw pointer.
// TODO(laigd): make TRTEngineOp::AllocateCalibrationResources() a member
// function of this class instead.
thr_->join();
calibration_table_ = raw_calibrator->getCalibrationTableAsString();
return calibration_table_;
}
const absl::string_view kTfTrtContainerName = "TF-TRT";
Logger& TRTEngineCacheResource::GetLogger() {


@@ -142,19 +142,7 @@ struct EngineContext {
// Contains the context required to build the calibration data.
class CalibrationContext {
public:
void SetCalibrationTable() {
calibration_table_ = calibrator_->getCalibrationTableAsString();
}
Status SerializeToString(string* serialized) {
calibrator_->waitAndSetDone();
thr_->join();
*serialized = calibration_table_;
if (serialized->empty()) {
return errors::Unknown("Calibration table is empty.");
}
return Status::OK();
}
string TerminateCalibration();
// Lookup table for temporary staging areas of input tensors for calibration.
std::unordered_map<string, std::pair<void*, size_t>> device_buffers_;
@@ -162,12 +150,16 @@ class CalibrationContext {
// Temporary staging areas for calibration inputs.
std::vector<PersistentTensor> device_tensors_;
string calibration_table_;
std::unique_ptr<TRTInt8Calibrator> calibrator_;
TrtUniquePtrType<nvinfer1::IBuilder> builder_;
TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
// TODO(sami): Use threadpool threads!
std::unique_ptr<std::thread> thr_;
private:
mutex mu_;
bool terminated_ GUARDED_BY(mu_) = false;
std::string calibration_table_ GUARDED_BY(mu_);
};
ABSL_CONST_INIT extern const absl::string_view kTfTrtContainerName;


@@ -1,4 +1,4 @@
load("//tensorflow:tensorflow.bzl", "tf_cc_test", "cc_header_only_library")
load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "tf_cc_test")
load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
load(
"//tensorflow/core/platform:default/build_config.bzl",


@@ -6,7 +6,7 @@ load(
"//third_party/mkl:build_defs.bzl",
"mkl_deps",
)
load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test")
load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_openmp_copts")
load(":build_defs.bzl", "runtime_copts")
package(
@@ -560,7 +560,7 @@ cc_library(
"runtime_conv2d_mkl.cc",
],
hdrs = ["runtime_conv2d_mkl.h"],
copts = runtime_copts(),
copts = runtime_copts() + tf_openmp_copts(),
visibility = ["//visibility:public"],
deps = [
":runtime_conv2d",


@@ -232,8 +232,11 @@ tensorflow/core/kernels/scatter_nd_op_cpu_impl_5.cc
tensorflow/core/kernels/scatter_nd_op_cpu_impl_6.cc
tensorflow/core/kernels/scatter_nd_op_cpu_impl_7.cc
tensorflow/core/kernels/scatter_op.cc
tensorflow/core/kernels/segment_reduction_ops.cc
tensorflow/core/kernels/segment_reduction_ops.cc
tensorflow/core/kernels/segment_reduction_ops_impl_1.cc
tensorflow/core/kernels/segment_reduction_ops_impl_2.cc
tensorflow/core/kernels/segment_reduction_ops_impl_3.cc
tensorflow/core/kernels/segment_reduction_ops_impl_4.cc
tensorflow/core/kernels/segment_reduction_ops_impl_5.cc
tensorflow/core/kernels/sendrecv_ops.cc
tensorflow/core/kernels/sequence_ops.cc
tensorflow/core/kernels/session_ops.cc


@@ -443,11 +443,9 @@ class Image(ItemHandler):
"""Decodes a raw image."""
return parsing_ops.decode_raw(image_buffer, out_type=self._dtype)
pred_fn_pairs = {
math_ops.logical_or(
math_ops.equal(image_format, 'raw'),
math_ops.equal(image_format, 'RAW')): decode_raw,
}
pred_fn_pairs = [(math_ops.logical_or(
math_ops.equal(image_format, 'raw'),
math_ops.equal(image_format, 'RAW')), decode_raw)]
image = control_flow_ops.case(
pred_fn_pairs, default=check_jpeg, exclusive=True)


@@ -86,6 +86,7 @@ load(
"tf_gen_op_libs",
"tf_generate_proto_text_sources",
"tf_genrule_cmd_append_to_srcs",
"tf_openmp_copts",
"tf_opts_nortti_if_android",
"tf_opts_nortti_if_emscripten",
"transitive_hdrs",
@@ -3263,7 +3264,7 @@ tf_cuda_library(
"public/version.h",
],
hdrs = CORE_CPU_LIB_HEADERS,
copts = tf_copts(),
copts = tf_copts() + tf_openmp_copts(),
deps = [
":bfc_allocator",
":graph",
@@ -4604,7 +4605,7 @@ tf_cc_test(
size = "small",
srcs = ["common_runtime/constant_folding_test.cc"],
linkstatic = tf_kernel_tests_linkstatic(),
tags = tf_cuda_tests_tags(),
tags = tf_cuda_tests_tags() + ["no_rocm"],
deps = [
":core",
":core_cpu",
@@ -4670,6 +4671,7 @@ tf_cuda_cc_test(
size = "small",
srcs = ["common_runtime/process_function_library_runtime_test.cc"],
linkstatic = tf_kernel_tests_linkstatic(),
tags = ["no_rocm"],
deps = [
":core_cpu",
":core_cpu_internal",


@@ -8,9 +8,9 @@ A variant tensor representing the input dataset.
END
}
in_arg {
name: "num_workers"
name: "num_replicas"
description: <<END
A scalar representing the number of workers to distribute this batch across. As
A scalar representing the number of replicas to distribute this batch across. As
a result of this transformation the current batch size would end up being
divided by this parameter.
END
@@ -18,6 +18,6 @@
summary: "Creates a dataset that changes the batch size."
description: <<END
Creates a dataset that changes the batch size of the dataset to current batch
size // num_workers.
size // num_replicas.
END
}


@@ -8,9 +8,9 @@ A variant tensor representing the input dataset.
END
}
in_arg {
name: "num_workers"
name: "num_replicas"
description: <<END
A scalar representing the number of workers to distribute this batch across. As
A scalar representing the number of replicas to distribute this batch across. As
a result of this transformation the current batch size would end up being
divided by this parameter.
END


@@ -1,9 +1,4 @@
op {
graph_op_name: "Equal"
endpoint {
name: "math.equal"
}
endpoint {
name: "equal"
}
visibility: HIDDEN
}


@@ -0,0 +1,4 @@
op {
graph_op_name: "Fill"
visibility: HIDDEN
}


@@ -1,9 +1,4 @@
op {
graph_op_name: "NotEqual"
endpoint {
name: "math.not_equal"
}
endpoint {
name: "not_equal"
}
visibility: HIDDEN
}


@@ -59,13 +59,13 @@ namespace {
const char* GetCollectiveName(const CollectiveParams* cp, bool nccl) {
switch (cp->instance.type) {
case BROADCAST_COLLECTIVE:
return nccl ? "NcclBroadcast" : "HierarchicalTreeBroadcast";
return "HierarchicalTreeBroadcast";
case REDUCTION_COLLECTIVE:
return nccl ? "NcclReduce" : "RingReduce";
case GATHER_COLLECTIVE:
return nccl ? "NcclGather" : "RingGather";
return "RingGather";
default:
return "undef";
@@ -91,8 +91,16 @@ void CollectiveParamResolverLocal::CompleteGroupLocal(
// Initialize group runtime details.
CollectiveImplementationInterface* col_impl;
status = CollectiveRegistry::LookupParamResolverInstance(
GetCollectiveName(cp, nccl_), &col_impl);
// Try to lookup a NCCL collective kernel. This will return error status
// if `NcclReduce` kernel is not present in the registry, e.g. on an
// environment that does not support NCCL.
status = CollectiveRegistry::LookupParamResolverInstance("NcclReduce",
&col_impl);
if (!status.ok()) {
// Fallback to non-NCCL collective.
status = CollectiveRegistry::LookupParamResolverInstance(
GetCollectiveName(cp, /*nccl=*/false), &col_impl);
}
if (status.ok()) {
status = col_impl->InitializeCollectiveGroupRuntimeDetails(
&gr->group.runtime_details);


@@ -51,9 +51,11 @@ limitations under the License.
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/device_name_utils.h"
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#elif TENSORFLOW_USE_ROCM
#include "rocm/include/hip/hip_runtime.h"
#endif // GOOGLE_CUDA
namespace tensorflow {
@@ -2089,6 +2091,12 @@ bool IsCUDATensor(const Tensor& t) {
if (err == cudaErrorInvalidValue) return false;
CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
return (attributes.memoryType == cudaMemoryTypeDevice);
#elif TENSORFLOW_USE_ROCM
hipPointerAttribute_t attributes;
hipError_t err = hipPointerGetAttributes(&attributes, t.tensor_data().data());
if (err == hipErrorInvalidValue) return false;
CHECK_EQ(hipSuccess, err) << hipGetErrorString(err);
return (attributes.memoryType == hipMemoryTypeDevice);
#else
return false;
#endif


@@ -1,6 +1,7 @@
load(
"//tensorflow:tensorflow.bzl",
"tf_cc_test",
"tf_copts",
"tf_cuda_library",
)
load(
@@ -276,6 +277,8 @@ cc_library(
cc_library(
name = "mkl_eager_op_rewrite",
srcs = ["mkl_eager_op_rewrite.cc"],
copts = tf_copts(),
nocopts = "-fno-exceptions",
deps = [
":eager_op_rewrite_registry",
"//tensorflow/core:framework",


@@ -486,10 +486,6 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
IsMultiDevice(ctx->FindFunctionDef(op->Name()));
std::vector<Device*> input_dev_ptrs;
// `input_tensor_shapes` contains (potentially a subset of) non DT_RESOURCE
// arguments, and `input_resource_variable_dtypes_and_shapes` contains shapes
// and underlying types for (potentially a subset) of DT_RESOURCE arguments.
std::unordered_map<int, TensorShape> input_tensor_shapes;
std::unordered_map<int, DtypeAndPartialTensorShape>
input_resource_variable_dtypes_and_shapes;
if (is_multi_device_function) {
@@ -524,19 +520,9 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
cache_key =
FingerprintCat128(cache_key, Fingerprint128(input_device->name()));
// If input is normal tensor, get its shape and add it to 'cache_key';
// If input is a ResourceHandle, get its resource handle dtypes and shapes
// and add them to 'cache_key'.
if (input->dtype != DT_RESOURCE) {
TensorShape shape;
TF_RETURN_IF_ERROR(input->Shape(&shape));
input_tensor_shapes[i] = shape;
// Add both _Arg index and shape to "cache_key".
cache_key = FingerprintCat128(cache_key, i);
AppendTensorShapeToFingerprint(shape, &cache_key);
} else {
if (input->dtype == DT_RESOURCE) {
// We only care about data type and shape for resource variable inputs.
// But we have no way to tell if input is resource variable (other than
// looking it up in ResourceMgr, which is slow). So we just get
@@ -616,7 +602,6 @@ Status EagerLocalExecute(EagerOperation* op, TensorHandle** retvals,
<< ". Full node_def=" << ndef.DebugString();
kernel.reset(new KernelAndDeviceFunc(
flr, ctx->pflr(), std::move(input_dev_ptrs),
std::move(input_tensor_shapes),
std::move(input_resource_variable_dtypes_and_shapes), runner,
ctx->GetCollectiveExecutorHandle(), ctx->HostCPU(), op->Name(),
[ctx](const int64 step_id) {


@@ -112,7 +112,6 @@ Status KernelAndDeviceFunc::Init(const NodeDef& ndef,
for (const Device* device : input_devices_) {
options.input_devices.push_back(device->name());
}
options.input_tensor_shapes = input_tensor_shapes_;
options.input_resource_dtypes_and_shapes = input_resource_dtypes_and_shapes_;
const auto& it = ndef.attr().find("executor_type");
@@ -337,7 +336,12 @@ Status KernelAndDeviceOp::Run(ScopedStepContainer* step_container,
if (outputs != nullptr) {
outputs->clear();
for (int i = 0; i < context.num_outputs(); ++i) {
outputs->push_back(Tensor(*context.mutable_output(i)));
const auto* output_tensor = context.mutable_output(i);
if (output_tensor != nullptr) {
outputs->push_back(Tensor(*output_tensor));
} else {
outputs->push_back(Tensor());
}
}
}
if (stats != nullptr) {


@@ -185,7 +185,6 @@ class KernelAndDeviceFunc final : public KernelAndDevice {
KernelAndDeviceFunc(
FunctionLibraryRuntime* flr, ProcessFunctionLibraryRuntime* pflr,
std::vector<Device*> input_devices,
std::unordered_map<int, TensorShape> input_tensor_shapes,
std::unordered_map<int, DtypeAndPartialTensorShape>
input_resource_dtypes_and_shapes,
std::function<void(std::function<void()>)>* runner,
@@ -197,7 +196,6 @@ class KernelAndDeviceFunc final : public KernelAndDevice {
pflr_(pflr),
handle_(kInvalidHandle),
input_devices_(std::move(input_devices)),
input_tensor_shapes_(std::move(input_tensor_shapes)),
input_resource_dtypes_and_shapes_(
std::move(input_resource_dtypes_and_shapes)),
name_(name),
@@ -240,7 +238,6 @@ class KernelAndDeviceFunc final : public KernelAndDevice {
// CPU devices are not null. Resource handles' devices are actual backing
// devices.
std::vector<Device*> input_devices_;
std::unordered_map<int, TensorShape> input_tensor_shapes_;
std::unordered_map<int, DtypeAndPartialTensorShape>
input_resource_dtypes_and_shapes_;


@@ -45,6 +45,11 @@ class MklEagerOpRewrite : public EagerOpRewrite {
static Status SetupNewOp(EagerOperation* orig_op, const string mkl_op_name,
std::unique_ptr<EagerOperation>* new_mkl_op);
// Generic rewrite that can be used for any mkl op that doesn't need
// special processing.
static Status CreateGenericMklOp(EagerOperation* orig_op,
std::unique_ptr<EagerOperation>* mkl_op);
// Creates new MKL op for Conv2D, Conv2DBackpropInput and
// Conv2DBackpropFilter.
static Status CreateMklConv2DOp(
@@ -60,6 +65,10 @@ class MklEagerOpRewrite : public EagerOpRewrite {
// Checks whether we can rewrite the op to MKL one or not.
bool ShouldRewriteOp(EagerOperation* op, int* op_idx);
// Default rewrite rule to be used when rewrite should happen without any
// restriction.
static bool AlwaysRewrite(EagerOperation* op) { return true; }
};
REGISTER_REWRITE(EagerOpRewriteRegistry::PRE_EXECUTION, MklEagerOpRewrite);
@@ -67,11 +76,15 @@ REGISTER_REWRITE(EagerOpRewriteRegistry::PRE_EXECUTION, MklEagerOpRewrite);
// Constructor
MklEagerOpRewrite::MklEagerOpRewrite(string name, string file, string line)
: EagerOpRewrite(name, file, line) {
mkl_eager_ops_.push_back({"BatchMatMul", AlwaysRewrite, CreateGenericMklOp});
mkl_eager_ops_.push_back(
{"BatchMatMulV2", AlwaysRewrite, CreateGenericMklOp});
mkl_eager_ops_.push_back({"Conv2D", RewriteConv2D, CreateMklConv2DOp});
mkl_eager_ops_.push_back(
{"Conv2DBackpropInput", RewriteConv2D, CreateMklConv2DOp});
mkl_eager_ops_.push_back(
{"Conv2DBackpropFilter", RewriteConv2D, CreateMklConv2DOp});
mkl_eager_ops_.push_back({"MatMul", AlwaysRewrite, CreateGenericMklOp});
}
Status MklEagerOpRewrite::Run(
@@ -124,6 +137,13 @@ Status MklEagerOpRewrite::SetupNewOp(
return Status::OK();
}
Status MklEagerOpRewrite::CreateGenericMklOp(
EagerOperation* orig_op, std::unique_ptr<EagerOperation>* mkl_op) {
const string mkl_op_name = mkl_op_registry::GetMklOpName(orig_op->Name());
TF_CHECK_OK(SetupNewOp(orig_op, mkl_op_name, mkl_op));
return Status::OK();
}
Status MklEagerOpRewrite::CreateMklConv2DOp(
EagerOperation* orig_op, std::unique_ptr<EagerOperation>* mkl_conv2d_op) {
const string mkl_op_name =


@@ -311,7 +311,6 @@ const string* AssignedOrRequestedDeviceName(const Node& node) {
}
Status SetArgShape(
const std::unordered_map<int, TensorShape>& input_tensor_shapes,
const std::unordered_map<int, DtypeAndPartialTensorShape>&
input_resource_dtypes_and_shapes,
const std::vector<Node*>& arg_nodes) {
@@ -320,16 +319,7 @@ Status SetArgShape(
TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "index", &index));
DataType dtype;
TF_RETURN_IF_ERROR(GetNodeAttr(n->def(), "T", &dtype));
if (dtype != DT_RESOURCE) {
auto shape_iter = input_tensor_shapes.find(index);
if (shape_iter != input_tensor_shapes.end()) {
TensorShapeProto shape_proto;
shape_iter->second.AsProto(&shape_proto);
AttrValue attr_value;
*attr_value.mutable_list()->add_shape() = shape_proto;
n->AddAttr("_output_shapes", attr_value);
}
} else {
if (dtype == DT_RESOURCE) {
auto dtype_and_shape_iter = input_resource_dtypes_and_shapes.find(index);
if (dtype_and_shape_iter != input_resource_dtypes_and_shapes.end()) {
AttrValue dtype_attr_value;
@@ -620,9 +610,8 @@ Status ProcessFunctionLibraryRuntime::InstantiateMultiDevice(
options.graph_collector->CollectRawGraph(def);
}
TF_RETURN_IF_ERROR(SetArgShape(options.input_tensor_shapes,
options.input_resource_dtypes_and_shapes,
arg_nodes));
TF_RETURN_IF_ERROR(
SetArgShape(options.input_resource_dtypes_and_shapes, arg_nodes));
TF_RETURN_IF_ERROR(PinArgsAndRets(options.input_devices,
options.output_devices, device_set_,
arg_nodes, ret_nodes));


@@ -33,9 +33,11 @@ limitations under the License.
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/public/version.h"
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA
#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#elif TENSORFLOW_USE_ROCM
#include "rocm/include/hip/hip_runtime.h"
#endif // GOOGLE_CUDA
namespace tensorflow {
@ -122,7 +124,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
}
Tensor GPUToCPU(const Tensor& device_tensor) {
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
CHECK(gpu_device_);
CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
DeviceContext* device_context =
@ -146,7 +148,7 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
}
Tensor CPUToGPU(const Tensor& cpu_tensor) {
#ifdef GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
CHECK(gpu_device_);
CHECK(gpu_device_->tensorflow_gpu_device_info() != nullptr);
DeviceContext* device_context =
@ -461,6 +463,12 @@ bool IsCUDATensor(const Tensor& t) {
if (err == cudaErrorInvalidValue) return false;
CHECK_EQ(cudaSuccess, err) << cudaGetErrorString(err);
return (attributes.memoryType == cudaMemoryTypeDevice);
#elif TENSORFLOW_USE_ROCM
hipPointerAttribute_t attributes;
hipError_t err = hipPointerGetAttributes(&attributes, t.tensor_data().data());
if (err == hipErrorInvalidValue) return false;
CHECK_EQ(hipSuccess, err) << hipGetErrorString(err);
return (attributes.memoryType == hipMemoryTypeDevice);
#else
CHECK(false)
<< "IsCUDATensor should not be called when CUDA is not available";


@ -561,6 +561,13 @@ Status ShapeRefiner::ConstantPartialShape(InferenceContext* target_context,
} else if (src_op == "StridedSlice") {
TF_RETURN_IF_ERROR(
PartialStridedSliceShape(input_edge->src(), src_context, result));
} else if (src_op == "VariableShape") {
auto* handle_data = src_context->input_handle_shapes_and_types(0);
if (handle_data != nullptr && !handle_data->empty()) {
*result = handle_data->at(0).shape;
} else {
*result = target_context->UnknownShape();
}
} else {
Tensor t;
bool evaluated = false;


@ -366,7 +366,8 @@ Status EinsumShape(shape_inference::InferenceContext* c) {
output_bcast_shape = input_bcast_shapes[0];
} else if (input_bcast_shapes.size() == 2) {
TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper(
c, input_bcast_shapes[0], input_bcast_shapes[1], &output_bcast_shape));
c, input_bcast_shapes[0], input_bcast_shapes[1], true,
&output_bcast_shape));
}
bool output_has_ellipsis = false;
@ -441,7 +442,7 @@ Status BatchMatMulV2Shape(shape_inference::InferenceContext* c) {
TF_RETURN_IF_ERROR(c->Subshape(b_shape, 0, -2, &b_batch_shape));
TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper(
c, a_batch_shape, b_batch_shape, &output_batch_shape));
c, a_batch_shape, b_batch_shape, true, &output_batch_shape));
ShapeHandle output_shape;
TF_RETURN_IF_ERROR(c->Concatenate(
@ -1613,6 +1614,7 @@ Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat) {
Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
ShapeHandle shape_x,
ShapeHandle shape_y,
bool incompatible_shape_error,
ShapeHandle* out) {
CHECK_NOTNULL(out);
if (!c->RankKnown(shape_x) || !c->RankKnown(shape_y)) {
@ -1646,8 +1648,16 @@ Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
// or the same as the known dim.
// - If either dimension is 1, the other dimension is the output.
if (c->Value(dim_x) > 1) {
if (!incompatible_shape_error) {
*out = c->UnknownShape();
return Status::OK();
}
dims.push_back(dim_x);
} else if (c->Value(dim_y) > 1) {
if (!incompatible_shape_error) {
*out = c->UnknownShape();
return Status::OK();
}
dims.push_back(dim_y);
} else if (c->Value(dim_x) == 1) {
dims.push_back(dim_y);
@ -1656,6 +1666,10 @@ Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
} else if (dim_y.SameHandle(dim_x)) {
dims.push_back(dim_x);
} else {
if (!incompatible_shape_error) {
*out = c->UnknownShape();
return Status::OK();
}
dims.push_back(c->UnknownDim());
}
} else if (c->Value(dim_x) == 1 || c->Value(dim_y) == 1) {
@ -1669,7 +1683,14 @@ Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
}
} else {
DimensionHandle dim;
TF_RETURN_IF_ERROR(c->Merge(dim_x, dim_y, &dim));
Status s = c->Merge(dim_x, dim_y, &dim);
if (!s.ok()) {
if (!incompatible_shape_error) {
*out = c->MakeShape({});
return Status::OK();
}
return s;
}
dims.push_back(dim);
}
}
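The hunk above threads a new `incompatible_shape_error` flag through the broadcast shape helper: when the flag is false, a dimension mismatch yields an unknown output shape instead of an error. A minimal standalone sketch of that behavior, with `-1` standing in for an unknown dimension and equal ranks assumed (the real helper also pads the shorter shape and handles unknown dims more finely):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Hypothetical simplification of BroadcastBinaryOpOutputShapeFnHelper.
// Returns false on a hard mismatch when incompatible_shape_error is true;
// when it is false, a mismatch produces an all-unknown shape and succeeds.
bool BroadcastDims(const std::vector<long long>& x,
                   const std::vector<long long>& y,
                   bool incompatible_shape_error,
                   std::vector<long long>* out) {
  out->clear();
  for (std::size_t i = 0; i < x.size(); ++i) {
    long long dx = x[i], dy = y[i];
    if (dx == dy || dy == 1) { out->push_back(dx); continue; }
    if (dx == 1) { out->push_back(dy); continue; }
    if (dx == -1 || dy == -1) { out->push_back(-1); continue; }
    // Both dims known, unequal, and neither is 1: incompatible.
    if (!incompatible_shape_error) {
      out->assign(x.size(), -1);  // Report "unknown shape", not an error.
      return true;
    }
    return false;
  }
  return true;
}
```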


@ -306,6 +306,7 @@ Status QuantizedConcatV2Shape(InferenceContext* c, int num_inputs_to_concat);
Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
ShapeHandle shape_x,
ShapeHandle shape_y,
bool incompatible_shape_error,
ShapeHandle* out);
// Shape function for binary operators that broadcast their inputs
@ -313,8 +314,8 @@ Status BroadcastBinaryOpOutputShapeFnHelper(InferenceContext* c,
inline Status BroadcastBinaryOpOutputShapeFn(InferenceContext* c,
int output_index) {
ShapeHandle out;
TF_RETURN_IF_ERROR(
BroadcastBinaryOpOutputShapeFnHelper(c, c->input(0), c->input(1), &out));
TF_RETURN_IF_ERROR(BroadcastBinaryOpOutputShapeFnHelper(
c, c->input(0), c->input(1), true, &out));
c->set_output(output_index, out);
return Status::OK();
}


@ -921,11 +921,6 @@ string Canonicalize(const string& funcname, AttrSlice attrs,
entries.push_back(strings::StrCat(
"_output_dev", i, "=", absl::CEscape(options.output_devices[i])));
}
for (const auto& iter : options.input_tensor_shapes) {
entries.push_back(
strings::StrCat("_input_tensor_shape", iter.first, "=",
absl::CEscape(iter.second.DebugString())));
}
for (const auto& iter : options.input_resource_dtypes_and_shapes) {
entries.push_back(strings::StrCat("_input_resource_dtype", iter.first, "=",
DataTypeString(iter.second.dtype)));


@ -563,14 +563,6 @@ class FunctionLibraryRuntime {
// infer correct device.
std::vector<string> output_devices;
// This interface is EXPERIMENTAL and subject to change.
//
// For multi-device functions, a mapping from _Arg node index to input
// tensor shape.
// REQUIRES: if input_tensor_shapes.count(i) > 0 then i-th argument type
// must not be DT_RESOURCE.
std::unordered_map<int, TensorShape> input_tensor_shapes;
// This interface is EXPERIMENTAL and subject to change.
//
// For multi-device functions, a mapping from _Arg node index to type and


@ -35,6 +35,17 @@ Status FinalizeOpDef(const OpDefBuilder& b, OpDef* op_def) {
return s;
}
// We can create a Graph containing a namespaced Op
TEST(AddToGraphTest, MakeGraphDefWithNamespacedOpName) {
OpList op_list;
TF_ASSERT_OK(FinalizeOpDef(OpDefBuilder("Project>SomeOp"), op_list.add_op()));
OpListOpRegistry registry(&op_list);
GraphDef graph_def;
TF_ASSERT_OK(NodeDefBuilder("node", "Project>SomeOp", &registry)
.Finalize(graph_def.add_node()));
}
// Producer and consumer have default for an attr -> graph unchanged.
TEST(RemoveNewDefaultAttrsFromGraphDefTest, NoChangeWithDefault) {
OpList op_list;


@ -11,7 +11,7 @@ import "tensorflow/core/framework/attr_value.proto";
message NodeDef {
// The name given to this operator. Used for naming inputs,
// logging, visualization, etc. Unique within a single GraphDef.
// Must match the regexp "[A-Za-z0-9.][A-Za-z0-9_./]*".
// Must match the regexp "[A-Za-z0-9.][A-Za-z0-9_>./]*".
string name = 1;
// The operation name. There may be custom parameters in attrs.


@ -742,12 +742,22 @@ namespace {
using ::tensorflow::strings::Scanner;
bool IsValidOpName(StringPiece sp) {
return Scanner(sp)
.One(Scanner::LETTER_DIGIT_DOT)
.Any(Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE)
.Eos()
.GetResult();
bool IsValidNodeName(StringPiece sp) {
Scanner scanner(sp);
scanner.One(Scanner::LETTER_DIGIT_DOT)
.Any(Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
while (true) {
if (!scanner.GetResult()) // Some error in previous iteration.
return false;
if (scanner.empty()) // No error, but nothing left, good.
return true;
// Absorb another piece, starting with a '>'
scanner.One(Scanner::RANGLE)
.One(Scanner::LETTER_DIGIT_DOT)
.Any(Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
}
}
bool IsValidDataInputName(StringPiece sp) {
@ -791,16 +801,16 @@ Status ValidateOpInput(const string& input_name, bool* is_control_input) {
}
}
Status ValidateOpName(const string& op_name) {
if (IsValidOpName(op_name)) {
Status ValidateNodeName(const string& node_name) {
if (IsValidNodeName(node_name)) {
return Status::OK();
} else {
return errors::InvalidArgument("Illegal op name '", op_name, "'");
return errors::InvalidArgument("Illegal op name '", node_name, "'");
}
}
Status ValidateExternalNodeDefSyntax(const NodeDef& node_def) {
Status s = ValidateOpName(node_def.name());
Status s = ValidateNodeName(node_def.name());
if (!s.ok()) {
return AttachDef(s, node_def);
}
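The scanner loop introduced above accepts node names made of one or more `>`-separated pieces, so namespaced names like `Project>SomeOp` validate while `>OpName`, `OpName>`, and `A>>B` are rejected. A standalone sketch of the same grammar without the TF `Scanner` class (the function name is ours, not TensorFlow's):

```cpp
#include <cassert>
#include <cctype>
#include <string>

// Sketch of the new IsValidNodeName logic: each piece starts with a letter,
// digit, or '.', continues with letters, digits, '-', '.', '/', '_', and
// pieces are joined by single '>' characters.
bool IsValidNodeNameSketch(const std::string& name) {
  std::size_t i = 0;
  while (true) {
    if (i >= name.size()) return false;  // Empty piece (e.g. trailing '>').
    char c = name[i];
    if (!(std::isalnum(static_cast<unsigned char>(c)) || c == '.')) return false;
    ++i;
    while (i < name.size()) {  // Absorb the rest of this piece.
      c = name[i];
      if (std::isalnum(static_cast<unsigned char>(c)) || c == '-' ||
          c == '.' || c == '/' || c == '_') {
        ++i;
      } else {
        break;
      }
    }
    if (i == name.size()) return true;  // Consumed everything: valid.
    if (name[i] != '>') return false;   // Only '>' may separate pieces.
    ++i;                                // Start the next piece after '>'.
  }
}
```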


@ -282,10 +282,28 @@ TEST(NodeDefUtilTest, ValidSyntax) {
)proto");
ExpectValidSyntax(node_def);
const NodeDef node_def_namespace = ToNodeDef(R"proto(
name: 'n'
op: 'Project>AnyIn'
input: 'a'
input: 'b'
attr {
key: 'T'
value { list { type: [ DT_INT32, DT_STRING ] } }
}
)proto");
ExpectValidSyntax(node_def_namespace);
const NodeDef node_def_explicit_inputs = ToNodeDef(R"proto(
name:'n' op:'AnyIn' input:'a:0' input:'b:123'
attr { key:'T' value { list { type: [DT_INT32, DT_STRING] } } }
)proto");
name: 'n'
op: 'AnyIn'
input: 'a:0'
input: 'b:123'
attr {
key: 'T'
value { list { type: [ DT_INT32, DT_STRING ] } }
}
)proto");
ExpectValidSyntax(node_def_explicit_inputs);
EXPECT_EQ("{{node n}} = AnyIn[T=[DT_INT32, DT_STRING]](a:0, b:123)",


@ -14,7 +14,7 @@ import "tensorflow/core/framework/types.proto";
// LINT.IfChange
message OpDef {
// Op names starting with an underscore are reserved for internal use.
// Names should be CamelCase and match the regexp "[A-Z][a-zA-Z0-9_]*".
// Names should be CamelCase and match the regexp "[A-Z][a-zA-Z0-9>_]*".
string name = 1;
// For describing inputs and outputs.


@ -248,16 +248,29 @@ static Status ValidateArg(const OpDef::ArgDef& arg, const OpDef& op_def,
return Status::OK();
}
Status ValidateOpDef(const OpDef& op_def) {
bool IsValidOpName(StringPiece sp) {
using ::tensorflow::strings::Scanner;
Scanner scanner(sp);
scanner.One(Scanner::UPPERLETTER).Any(Scanner::LETTER_DIGIT_UNDERSCORE);
while (true) {
if (!scanner.GetResult()) // Some error in previous iteration.
return false;
if (scanner.empty()) // No error, but nothing left, good.
return true;
// Absorb another name/namespace, starting with a '>'
scanner.One(Scanner::RANGLE)
.One(Scanner::UPPERLETTER)
.Any(Scanner::LETTER_DIGIT_UNDERSCORE);
}
}
Status ValidateOpDef(const OpDef& op_def) {
if (!absl::StartsWith(op_def.name(), "_")) {
VALIDATE(Scanner(op_def.name())
.One(Scanner::UPPERLETTER)
.Any(Scanner::LETTER_DIGIT_UNDERSCORE)
.Eos()
.GetResult(),
"Invalid name: ", op_def.name(), " (Did you use CamelCase?)");
VALIDATE(IsValidOpName(op_def.name()), "Invalid name: ", op_def.name(),
" (Did you use CamelCase?)");
}
std::set<string> names; // for detecting duplicate names


@ -74,12 +74,26 @@ TEST_F(ValidateOpDefTest, OpDefValid) {
TF_EXPECT_OK(TestBuilder(OpDefBuilder("X").Attr("a: int >= -5 = 3")));
TF_EXPECT_OK(TestBuilder(OpDefBuilder("X").Attr("a: numbertype")));
TF_EXPECT_OK(TestBuilder(OpDefBuilder("Uppercase")));
TF_EXPECT_OK(TestBuilder(OpDefBuilder("Namespace>X").Attr("a: int")));
TF_EXPECT_OK(TestBuilder(OpDefBuilder("Namespace>X>Y").Attr("a: int")));
}
TEST_F(ValidateOpDefTest, InvalidName) {
ExpectFailure(TestBuilder(OpDefBuilder("lower").Attr("a: int")),
"Invalid name");
ExpectFailure(TestBuilder(OpDefBuilder("BadSuffix 7%")), "Invalid name");
ExpectFailure(TestBuilder(OpDefBuilder(">OpName").Attr("a: int")),
"Invalid name");
// Can't have a dangling empty namespace
ExpectFailure(TestBuilder(OpDefBuilder("OpName>").Attr("a: int")),
"Invalid name");
// Each namespace section must be Camelcased
ExpectFailure(TestBuilder(OpDefBuilder("OpName>b").Attr("a: int")),
"Invalid name");
// Can't have empty namespaces
ExpectFailure(TestBuilder(OpDefBuilder("OpName>A>>B").Attr("a: int")),
"Invalid name");
}
TEST_F(ValidateOpDefTest, DuplicateName) {


@ -264,7 +264,24 @@ class LocalRendezvousImpl : public Rendezvous {
VLOG(2) << "Enqueue Recv Item (key:" << key.FullKey() << "). ";
Item* item = new Item;
item->waiter = std::move(done);
if (cm != nullptr) {
auto wrapped_done = std::bind(
[cm, token](const DoneCallback& done,
// Begin unbound arguments.
const Status& s, const Args& send_args,
const Args& recv_args, const Tensor& v, bool dead) {
cm->TryDeregisterCallback(token);
done(s, send_args, recv_args, v, dead);
},
std::move(done), std::placeholders::_1, std::placeholders::_2,
std::placeholders::_3, std::placeholders::_4,
std::placeholders::_5);
item->waiter = std::move(wrapped_done);
} else {
item->waiter = std::move(done);
}
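The change above wraps the receive-done callback so the cancellation token is deregistered at the moment the callback fires, rather than at item destruction (the deregistration in the destructor is deleted further down). A small sketch of the wrapping pattern with a lambda in place of `std::bind`; the callback signatures here are simplified stand-ins, not the real rendezvous types:

```cpp
#include <cassert>
#include <functional>
#include <utility>

// Sketch: produce a callback that first deregisters the cancellation hook,
// then forwards to the original completion callback.
std::function<void(int)> WrapDone(std::function<void(int)> done,
                                  std::function<void()> deregister) {
  return [done = std::move(done),
          deregister = std::move(deregister)](int status) {
    deregister();  // cm->TryDeregisterCallback(token) in the real code.
    done(status);  // Forward to the original waiter.
  };
}
```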
item->recv_args = recv_args;
item->cancellation_token = token;
if (item->recv_args.device_context) {
@ -332,11 +349,6 @@ class LocalRendezvousImpl : public Rendezvous {
if (recv_args.device_context) {
recv_args.device_context->Unref();
}
auto* cm = recv_args.cancellation_manager;
if (cancellation_token != CancellationManager::kInvalidToken &&
cm != nullptr) {
cm->TryDeregisterCallback(cancellation_token);
}
}
// Returns true iff this item represents a value being sent.


@ -515,7 +515,12 @@ TensorBuffer* FromProtoField<Variant>(Allocator* a, const TensorProto& in,
if (in_n <= 0) {
std::fill_n(data, n, Variant());
} else {
for (int64 i = 0; i < in_n; ++i) {
// If tensor shape says we have n < in_n elements in the output tensor
// then make sure to only decode the first n out of the in_n elements in the
// in tensors. In all other cases, we decode all in_n elements of in and set
// the remaining elements up to n to be the default Variant() value.
const int64 real_n = n < in_n ? n : in_n;
for (int64 i = 0; i < real_n; ++i) {
data[i] = in.variant_val(i);
if (!DecodeUnaryVariant(&data[i])) {
LOG(ERROR) << "Could not decode variant with type_name: \""
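The clamping fix above can be sketched in isolation: when filling an output buffer of `n` elements from a proto carrying `in_n` values, decode only the first `min(n, in_n)` and leave the rest default-initialized, so a proto with more values than the tensor shape allows cannot write past the buffer. A minimal stand-in (plain `int` in place of `Variant`, names ours):

```cpp
#include <algorithm>
#include <cassert>
#include <vector>

// Sketch of the real_n clamp: copy at most n values out of proto_vals,
// defaulting the remainder (0 stands in for the default Variant()).
std::vector<int> FillFromProto(long long n, const std::vector<int>& proto_vals) {
  std::vector<int> data(static_cast<std::size_t>(n), 0);
  const long long in_n = static_cast<long long>(proto_vals.size());
  const long long real_n = std::min(n, in_n);
  for (long long i = 0; i < real_n; ++i) {
    data[static_cast<std::size_t>(i)] = proto_vals[static_cast<std::size_t>(i)];
  }
  return data;
}
```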


@ -66,12 +66,23 @@ inline bool IsNextIteration(const NodeDef& node_def) {
bool IsValidNodeName(StringPiece s, bool allow_internal_ops) {
using ::tensorflow::strings::Scanner;
return Scanner(s)
Scanner scanner(s);
scanner
.One(allow_internal_ops ? Scanner::LETTER_DIGIT_DOT_UNDERSCORE
: Scanner::LETTER_DIGIT_DOT)
.Any(Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE)
.Eos()
.GetResult();
.Any(Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
while (true) {
if (!scanner.GetResult()) // Some error in previous iteration.
return false;
if (scanner.empty()) // No error, but nothing left, good.
return true;
// Absorb another piece, starting with a '>'
scanner.One(Scanner::RANGLE)
.One(Scanner::LETTER_DIGIT_DOT)
.Any(Scanner::LETTER_DIGIT_DASH_DOT_SLASH_UNDERSCORE);
}
}
class GraphConstructor {
@ -1399,6 +1410,17 @@ void GraphConstructor::Undo() {
Status GraphConstructor::MakeEdge(Node* src, int output_index, Node* dst,
int input_index) {
if (output_index >= src->num_outputs()) {
return errors::InvalidArgument(
"Output ", output_index, " of node ", src->name(),
" does not exist. Node only has ", src->num_outputs(), " outputs.");
}
if (input_index >= dst->num_inputs()) {
return errors::InvalidArgument(
"Input ", input_index, " of node ", dst->name(),
" does not exist. Node only has ", dst->num_inputs(), " inputs.");
}
DataType src_out = src->output_type(output_index);
DataType dst_in = dst->input_type(input_index);
if (!TypesCompatible(dst_in, src_out)) {


@ -206,6 +206,7 @@ static inline bool IsMklElementWiseOp(const string& op_name, DataType T) {
return false;
}
bool result = (0 == op_name.compare(GetMklOpName("Add")) ||
0 == op_name.compare(GetMklOpName("AddV2")) ||
0 == op_name.compare(GetMklOpName("Sub")) ||
0 == op_name.compare(GetMklOpName("Mul")) ||
0 == op_name.compare(GetMklOpName("Maximum")) ||


@ -246,6 +246,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
csinfo_.avg_pool3d = "AvgPool3D";
csinfo_.avg_pool3d_grad = "AvgPool3DGrad";
csinfo_.batch_matmul = "BatchMatMul";
csinfo_.batch_matmul_v2 = "BatchMatMulV2";
csinfo_.bias_add = "BiasAdd";
csinfo_.bias_add_grad = "BiasAddGrad";
csinfo_.concat = "Concat";
@ -349,6 +350,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
// in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
// MklInputConversion op is added before it.
csinfo_.add = "Add";
csinfo_.add_v2 = "AddV2";
csinfo_.maximum = "Maximum";
csinfo_.mul = "Mul";
csinfo_.squared_difference = "SquaredDifference";
@ -363,6 +365,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
rinfo_.push_back({csinfo_.add, mkl_op_registry::GetMklOpName(csinfo_.add),
CopyAttrsAll, RewriteIfAtleastOneMklInput,
kRewriteForLayoutPropagation});
rinfo_.push_back({csinfo_.add_v2,
mkl_op_registry::GetMklOpName(csinfo_.add_v2),
CopyAttrsAll, RewriteIfAtleastOneMklInput,
kRewriteForLayoutPropagation});
rinfo_.push_back(
{csinfo_.avg_pool, mkl_op_registry::GetMklOpName(csinfo_.avg_pool),
CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation});
@ -380,6 +386,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
rinfo_.push_back({csinfo_.batch_matmul,
mkl_op_registry::GetMklOpName(csinfo_.batch_matmul),
CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange});
rinfo_.push_back({csinfo_.batch_matmul_v2,
mkl_op_registry::GetMklOpName(csinfo_.batch_matmul_v2),
CopyAttrsAll, AlwaysRewrite, kRewriteForOpNameChange});
rinfo_.push_back(
{csinfo_.concat, mkl_op_registry::GetMklOpName(csinfo_.concat),
CopyAttrsAll, AlwaysRewrite, kRewriteForLayoutPropagation});
@ -863,11 +872,13 @@ rinfo_.push_back({csinfo_.tanh_grad,
typedef struct {
string addn;
string add;
string add_v2;
string avg_pool;
string avg_pool_grad;
string avg_pool3d;
string avg_pool3d_grad;
string batch_matmul;
string batch_matmul_v2;
string bias_add;
string bias_add_grad;
string concat;


@ -3776,6 +3776,65 @@ TEST_F(MklLayoutPassTest, NodeRewrite_Slice_DeviceTest) {
"B->D:1;C->D:2;D->E:1");
}
// The following positive and negative tests test the rewrite of Add and AddV2
// to MKL versions. The operators will be rewritten only if one of the inputs
// comes from another MKL operator.
TEST_F(MklLayoutPassTest, PositiveRewriteAdd) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'M' op: 'Relu'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A']}"
"node { name: 'N' op: 'Add'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['M', 'B']}");
EXPECT_EQ(
DoMklLayoutOptimizationPass(),
"A(Input);B(Input);DMT/_0(Const);DMT/_1(Const);M(_MklRelu);N(_MklAdd)"
"|A->M;A:control->DMT/_0:control;B->N:1;DMT/_0->M:1;DMT/_1->N:3;M->N;"
"M:1->N:2;M:control->DMT/_1:control");
}
TEST_F(MklLayoutPassTest, NegativeRewriteAdd) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'Add'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);N(Add)|A->N;B->N:1");
}
TEST_F(MklLayoutPassTest, PositiveRewriteAddV2) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'M' op: 'Relu'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A']}"
"node { name: 'N' op: 'AddV2'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['M', 'B']}");
EXPECT_EQ(
DoMklLayoutOptimizationPass(),
"A(Input);B(Input);DMT/_0(Const);DMT/_1(Const);M(_MklRelu);N(_MklAddV2)"
"|A->M;A:control->DMT/_0:control;B->N:1;DMT/_0->M:1;DMT/_1->N:3;M->N;"
"M:1->N:2;M:control->DMT/_1:control");
}
TEST_F(MklLayoutPassTest, NegativeRewriteAddV2) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'N' op: 'AddV2'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);N(AddV2)|A->N;B->N:1");
}
/////////////////////////////////////////////////////////////////////
// Post-rewrite fixup pass test
/////////////////////////////////////////////////////////////////////
@ -4307,6 +4366,39 @@ TEST_F(MklLayoutPassTest,
"H->K:7;I->K:8;J->L:1;K->L");
}
TEST_F(MklLayoutPassTest, MatMul_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'MatMul'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklMatMul)|A->C;B->C:1");
}
TEST_F(MklLayoutPassTest, BatchMatMul_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'BatchMatMul'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklBatchMatMul)|A->C;B->C:1");
}
TEST_F(MklLayoutPassTest, BatchMatMulV2_Positive) {
InitGraph(
"node { name: 'A' op: 'Input'}"
"node { name: 'B' op: 'Input'}"
"node { name: 'C' op: 'BatchMatMulV2'"
" attr { key: 'T' value { type: DT_FLOAT } }"
" input: ['A', 'B']}");
EXPECT_EQ(DoMklLayoutOptimizationPass(),
"A(Input);B(Input);C(_MklBatchMatMulV2)|A->C;B->C:1");
}
static void BM_MklLayoutRewritePass(int iters, int op_nodes) {
testing::StopTiming();
string s;


@ -40,6 +40,18 @@ TEST(UtilsTest, GetLocalGPUInfo) {
properties = GetLocalGPUInfo(PlatformGpuId(0));
EXPECT_EQ("GPU", properties.type());
EXPECT_EQ("NVIDIA", properties.vendor());
#elif TENSORFLOW_USE_ROCM
LOG(INFO) << "ROCm is enabled.";
DeviceProperties properties;
// Invalid platform GPU ID.
properties = GetLocalGPUInfo(PlatformGpuId(100));
EXPECT_EQ("UNKNOWN", properties.type());
// Succeed when a valid platform GPU id was inserted.
properties = GetLocalGPUInfo(PlatformGpuId(0));
EXPECT_EQ("GPU", properties.type());
EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
#else
LOG(INFO) << "CUDA is not enabled.";
DeviceProperties properties;
@ -73,6 +85,8 @@ TEST(UtilsTest, GetDeviceInfo) {
EXPECT_EQ("GPU", properties.type());
#if GOOGLE_CUDA
EXPECT_EQ("NVIDIA", properties.vendor());
#elif TENSORFLOW_USE_ROCM
EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
#endif
// TF to platform GPU id mapping entry doesn't exist.
@ -81,7 +95,7 @@ TEST(UtilsTest, GetDeviceInfo) {
properties = GetDeviceInfo(device);
EXPECT_EQ("UNKNOWN", properties.type());
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Invalid platform GPU id.
TF_ASSERT_OK(
GpuIdManager::InsertTfPlatformGpuIdPair(TfGpuId(0), PlatformGpuId(100)));
@ -94,7 +108,11 @@ TEST(UtilsTest, GetDeviceInfo) {
device.id = 1;
properties = GetDeviceInfo(device);
EXPECT_EQ("GPU", properties.type());
#if GOOGLE_CUDA
EXPECT_EQ("NVIDIA", properties.vendor());
#elif TENSORFLOW_USE_ROCM
EXPECT_EQ("Advanced Micro Devices, Inc", properties.vendor());
#endif
#endif
}


@ -39,7 +39,7 @@ Status RebatchOptimizer::Init(
return errors::InvalidArgument(
"Cannot initialize RebatchOptimizer without config.");
num_workers_ = config->parameter_map().at("num_workers").i();
num_replicas_ = config->parameter_map().at("num_replicas").i();
use_fallback_ = config->parameter_map().at("use_fallback").b();
return Status::OK();
}
@ -200,11 +200,13 @@ Status AddConstBoolNode(bool value, FunctionDef* fdef, NodeDef** result) {
return Status::OK();
}
Status AddShapeNode(const NodeDefBuilder::NodeOut& input, FunctionDef* fdef,
NodeDef** result) {
Status AddShapeNode(const NodeDefBuilder::NodeOut& input, DataType out_type,
FunctionDef* fdef, NodeDef** result) {
*result = fdef->add_node_def();
TF_RETURN_IF_ERROR(
NodeDefBuilder("", "Shape").Input(input).Finalize(*result));
TF_RETURN_IF_ERROR(NodeDefBuilder("", "Shape")
.Input(input)
.Attr("out_type", out_type)
.Finalize(*result));
function_utils::SetUniqueFunctionNodeName("rebatch/shape", fdef, *result);
return Status::OK();
}
@ -276,45 +278,60 @@ void SetUnknownShapes(int num_components, AttrValue* output_shapes) {
}
}
Status GetBatchDim(AttrValue output_shapes, int* batch_dim) {
const auto& shape_0 = output_shapes.list().shape(0);
if (shape_0.unknown_rank() || shape_0.dim(0).size() == -1) {
// If the batch dimension is known and divisible by num_replicas, we set
// result = batch_dim / num_replicas. If the batch dimension is unknown,
// result = -1. If the dataset node is missing an output shapes attr,
// or the batch dimensions of its components don't match, we return an error
// status.
Status GetMinibatchDimForReshape(const NodeDef& dataset_node,
int64 num_replicas, int64* result) {
AttrValue output_shapes;
if (!dataset_node.attr().contains(kOutputShapesAttr)) {
return errors::InvalidArgument(
"Cannot use rebatching fallback when 0th dimensions of dataset "
"components are not fully known. Component 0 has shape: ",
shape_0.ShortDebugString());
"Cannot use rebatching fallback when the final dataset node does not "
"have an `output_shapes` attr. Node: ",
dataset_node.name(), " Op: ", dataset_node.op());
}
output_shapes = dataset_node.attr().at(kOutputShapesAttr);
*batch_dim = output_shapes.list().shape(0).dim(0).size();
for (int i = 1; i < output_shapes.list().shape_size(); ++i) {
// Get the batch dimension by checking the 0th dimension of all the inputs.
int batch_dim = -1;
for (int i = 0; i < output_shapes.list().shape_size(); ++i) {
const auto& shape_i = output_shapes.list().shape(i);
if (shape_i.unknown_rank() || shape_i.dim(0).size() == -1) {
// If unknown, ignore.
if (shape_i.unknown_rank()) continue;
int batch_dim_i = shape_i.dim(0).size();
if (batch_dim_i == -1) continue;
// Update batch_dim with known dimension.
if (batch_dim_i != batch_dim && batch_dim != -1) {
return errors::InvalidArgument(
"Cannot use rebatching fallback when 0th dimensions of dataset "
"components are not fully known. Component ",
i, " has shape: ", shape_i.ShortDebugString());
}
if (shape_i.dim(0).size() != *batch_dim) {
return errors::InvalidArgument(
"Cannot use rebatching fallback when 0th dimensions of dataset "
"Cannot use rebatching fallback: 0th dimensions of dataset "
"components don't match. Component ",
i, " has batch dimension: ", shape_i.dim(0).size(),
" while previous components have batch dimension: ", *batch_dim);
i, " has batch dimension: ", batch_dim_i,
" while previous components have batch dimension: ", batch_dim);
}
batch_dim = batch_dim_i;
}
if (batch_dim == -1 || batch_dim % num_replicas != 0) {
*result = -1;
} else {
*result = batch_dim / num_replicas;
}
return Status::OK();
}
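The decision `GetMinibatchDimForReshape` makes above can be sketched without the graph plumbing: scan the known 0th dimensions of the components, require the known ones to agree, then return `batch_dim / num_replicas` only when the batch dimension is known and divisible, and `-1` (unknown) otherwise. A standalone stand-in, with `-1` marking an unknown dimension and the function name ours:

```cpp
#include <cassert>
#include <vector>

// Returns false on mismatched known batch dims (an error in the real code);
// otherwise writes the per-replica minibatch dim, or -1 if it is unknown
// or the batch dim does not divide evenly by num_replicas.
bool MinibatchDim(const std::vector<long long>& batch_dims,
                  long long num_replicas, long long* result) {
  long long batch_dim = -1;
  for (long long d : batch_dims) {
    if (d == -1) continue;  // Unknown component dims are ignored.
    if (batch_dim != -1 && d != batch_dim) return false;  // Mismatch.
    batch_dim = d;
  }
  *result = (batch_dim != -1 && batch_dim % num_replicas == 0)
                ? batch_dim / num_replicas
                : -1;
  return true;
}
```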
Status UpdateOutputShapes(const string& node_name, int64 num_workers,
Status UpdateOutputShapes(const string& node_name, int64 num_replicas,
MutableGraphView* graph) {
NodeDef* node = graph->GetNode(node_name);
if (node->attr().contains(kOutputShapesAttr)) {
AttrValue output_shapes = node->attr().at(kOutputShapesAttr);
for (auto& shape : *output_shapes.mutable_list()->mutable_shape()) {
if (!shape.unknown_rank() && shape.dim(0).size() != -1) {
shape.mutable_dim(0)->set_size(shape.dim(0).size() / num_workers);
shape.mutable_dim(0)->set_size(shape.dim(0).size() / num_replicas);
}
}
(*node->mutable_attr())[kOutputShapesAttr] = output_shapes;
@ -335,16 +352,16 @@ int64 GetBatchSizeArgIndex(const NodeDef& batch_node) {
}
Status MakeNewBatchSizeNode(const string& global_batch_size_name,
int64 num_workers, FunctionDef* fdef,
int64 num_replicas, FunctionDef* fdef,
NodeDef** result) {
NodeDef* one_node;
TF_RETURN_IF_ERROR(AddConstInt64Node(1, fdef, &one_node));
NodeDef* num_workers_node;
TF_RETURN_IF_ERROR(AddConstInt64Node(num_workers, fdef, &num_workers_node));
NodeDef* num_replicas_node;
TF_RETURN_IF_ERROR(AddConstInt64Node(num_replicas, fdef, &num_replicas_node));
NodeDef* numerator_node =
AddBinaryNode(global_batch_size_name,
strings::StrCat(num_workers_node->name(), ":output:0"),
strings::StrCat(num_replicas_node->name(), ":output:0"),
kAddOp, DT_INT64, fdef);
numerator_node = AddBinaryNode(
strings::StrCat(numerator_node->name(), ":z:0"),
@ -352,14 +369,14 @@ Status MakeNewBatchSizeNode(const string& global_batch_size_name,
*result =
AddBinaryNode(strings::StrCat(numerator_node->name(), ":z:0"),
strings::StrCat(num_workers_node->name(), ":output:0"),
strings::StrCat(num_replicas_node->name(), ":output:0"),
kTruncateDivOp, DT_INT64, fdef);
return Status::OK();
}
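`MakeNewBatchSizeNode` above builds, out of Add and TruncateDiv nodes, the ceiling division `(global_batch_size + num_replicas - 1) / num_replicas`, so the last replica's minibatch may be smaller but the per-step total still sums to the global batch. The arithmetic in one line:

```cpp
#include <cassert>

// Ceiling division as constructed by the Add/TruncateDiv node chain:
// minibatch_size = ceil(global_batch_size / num_replicas).
long long MinibatchSize(long long global_batch_size, long long num_replicas) {
  return (global_batch_size + num_replicas - 1) / num_replicas;
}
```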
// Given a "batch" dataset node, we replace the `batch_size` input with a new
// input that corresponds to the original input divided by `num_workers`.
Status MutateBatchSize(const NodeDef& node, int64 num_workers,
// input that corresponds to the original input divided by `num_replicas`.
Status MutateBatchSize(const NodeDef& node, int64 num_replicas,
MutableGraphView* graph) {
// For all the batching datasets the batch_size is input number 1 except for
// MapAndBatchDataset.
@ -369,8 +386,8 @@ Status MutateBatchSize(const NodeDef& node, int64 num_workers,
int64 batch_size;
TF_RETURN_IF_ERROR(
graph_utils::GetScalarConstNodeValue(*batch_size_node, &batch_size));
DCHECK_EQ(batch_size % num_workers, 0);
batch_size = batch_size / num_workers;
DCHECK_EQ(batch_size % num_replicas, 0);
batch_size = batch_size / num_replicas;
NodeDef* new_batch_size_node =
graph_utils::AddScalarConstNode<int64>(batch_size, graph);
// We don't call UpdateFanouts here because CSE elimination might lead to
@ -411,10 +428,12 @@ Status AddFlatMapNode(const string& input_dataset,
}
// def flat_map_fn(*batched_components):
// batch_size = tf.shape(batched_components[0])[0]
// minibatch_size = (batch_size + num_replicas - 1) // num_replicas
// ds = tf.data.Dataset.from_tensor_slices(batched_components)
// return ds.batch(minibatch_size, drop_remainder=False)
Status CreateFlatMapFnWithBatch(const DataTypeVector& dtypes, int64 num_workers,
FunctionDef* result) {
Status CreateFlatMapFnWithBatch(const DataTypeVector& dtypes,
int64 num_replicas, FunctionDef* result) {
NodeDef* tensor_slice_node = result->add_node_def();
tensor_slice_node->set_op("TensorSliceDataset");
for (int i = 0; i < dtypes.size(); ++i) {
@ -439,13 +458,32 @@ Status CreateFlatMapFnWithBatch(const DataTypeVector& dtypes, int64 num_workers,
batch_node->add_input(
strings::StrCat(tensor_slice_node->name(), ":handle:0"));
// `batch_size` input
// Here, we capture the original batch size from outside the flat map fn.
auto* original_batch_size =
function_utils::AddFunctionInput("captured_batch_size", result, DT_INT64);
// `batch_size` is tf.shape(arg)[0]
NodeDef* shape;
TF_RETURN_IF_ERROR(AddShapeNode({tensor_slice_node->input(0), 0, dtypes[0]},
DT_INT64, result, &shape));
// Const with value [0]
NodeDef* const_vec_0;
TF_RETURN_IF_ERROR(AddConstIntNode({0}, {1}, result, &const_vec_0));
// Const with value [1]
NodeDef* const_vec_1;
TF_RETURN_IF_ERROR(AddConstIntNode({1}, {1}, result, &const_vec_1));
// Extracts the 0th dimension from the shape node.
NodeDef* original_batch_size;
TF_RETURN_IF_ERROR(AddStridedSliceNode(
{strings::StrCat(shape->name(), ":output"), 0, DT_INT64},
{strings::StrCat(const_vec_0->name(), ":output"), 0, DT_INT32},
{strings::StrCat(const_vec_1->name(), ":output"), 0, DT_INT32},
{strings::StrCat(const_vec_1->name(), ":output"), 0, DT_INT32}, DT_INT32,
0, 0, 0, 0, 1, result, &original_batch_size));
NodeDef* new_batch_size;
TF_RETURN_IF_ERROR(MakeNewBatchSizeNode(
original_batch_size->name(), num_workers, result, &new_batch_size));
strings::StrCat(original_batch_size->name(), ":output:0"), num_replicas,
result, &new_batch_size));
batch_node->add_input(strings::StrCat(new_batch_size->name(), ":z:0"));
// `drop_remainder` input
@ -470,9 +508,9 @@ Status CreateFlatMapFnWithBatch(const DataTypeVector& dtypes, int64 num_workers,
// in a step adds up to the global batch size. However, since this adds
// additional data copies (both from_tensor_slices and batch), we only use
// this approach when necessary, i.e. when we need to drop remainder on the
// global batch, or when the global batch size does not divide num_workers
// global batch, or when the global batch size does not divide num_replicas
// evenly.
Status AppendFlatMap(const NodeDef& batch_node, int64 num_workers,
Status AppendFlatMap(const NodeDef& batch_node, int64 num_replicas,
FunctionLibraryDefinition* flib, MutableGraphView* graph) {
// `.flat_map(lambda x: tf.data.Dataset.from_tensor_slices(x).
// batch(minibatch_size, drop_remainder=False))`
@ -484,9 +522,7 @@ Status AppendFlatMap(const NodeDef& batch_node, int64 num_workers,
TF_RETURN_IF_ERROR(
graph_utils::GetDatasetOutputTypesAttr(batch_node, &dtypes));
TF_RETURN_IF_ERROR(
CreateFlatMapFnWithBatch(dtypes, num_workers, &flat_map_fn));
int64 batch_size_index = GetBatchSizeArgIndex(batch_node);
CreateFlatMapFnWithBatch(dtypes, num_replicas, &flat_map_fn));
NodeDef* flat_map_node;
@ -496,15 +532,14 @@ Status AppendFlatMap(const NodeDef& batch_node, int64 num_workers,
// Because the flat map function uses drop_remainder = False,
// the shape might be unknown.
auto old_dim = shape.dim(0).size();
auto new_dim = old_dim % num_workers == 0 ? old_dim / num_workers : -1;
auto new_dim = old_dim % num_replicas == 0 ? old_dim / num_replicas : -1;
shape.mutable_dim(0)->set_size(new_dim);
}
}
TF_RETURN_IF_ERROR(AddFlatMapNode(strings::StrCat(batch_node.name(), ":0"),
{batch_node.input(batch_size_index)},
{DT_INT64}, flat_map_fn, output_shapes,
dtypes, flib, graph, &flat_map_node));
{}, {}, flat_map_fn, output_shapes, dtypes,
flib, graph, &flat_map_node));
TF_RETURN_IF_ERROR(
graph->UpdateFanouts(batch_node.name(), flat_map_node->name()));
@ -514,12 +549,13 @@ Status AppendFlatMap(const NodeDef& batch_node, int64 num_workers,
// There are several things we do here, depending on the values of
// batch_size and drop_remainder.
// (1) If batch size is known and divisible by num_workers, and drop_remainder
// (1) If batch size is known and divisible by num_replicas, and drop_remainder
// is known to be False, we mutate the batch size directly.
// .batch(global_batch_size) -> .batch(global_batch_size // num_workers)
// .batch(global_batch_size) -> .batch(global_batch_size // num_replicas)
// (2) Otherwise, we add a flat_map transformation to preserve the global batch
// size across the workers and to preserve the drop remainder behavior.
bool ShouldMutateBatchSizeDirectly(const NodeDef& batch_node, int64 num_workers,
// size across the replicas and to preserve the drop remainder behavior.
bool ShouldMutateBatchSizeDirectly(const NodeDef& batch_node,
int64 num_replicas,
MutableGraphView* graph) {
int64 batch_size_arg_index = GetBatchSizeArgIndex(batch_node);
NodeDef* batch_size_node =
@ -528,9 +564,9 @@ bool ShouldMutateBatchSizeDirectly(const NodeDef& batch_node, int64 num_workers,
int64 batch_size;
Status s =
graph_utils::GetScalarConstNodeValue(*batch_size_node, &batch_size);
// If batch size is unknown or indivisible by num workers, we don't
// If batch size is unknown or indivisible by num replicas, we don't
// mutate it directly
if (!s.ok() || batch_size % num_workers != 0) return false;
if (!s.ok() || batch_size % num_replicas != 0) return false;
if (batch_node.op() == kBatchOp || batch_node.op() == kPaddedBatchOp) {
// These ops don't have a `drop_remainder` input, and behave like
@ -547,16 +583,16 @@ bool ShouldMutateBatchSizeDirectly(const NodeDef& batch_node, int64 num_workers,
return s.ok() && !drop_remainder;
}
Status RewriteBatchNode(const NodeDef& batch_node, int64 num_workers,
Status RewriteBatchNode(const NodeDef& batch_node, int64 num_replicas,
FunctionLibraryDefinition* flib,
MutableGraphView* graph) {
if (ShouldMutateBatchSizeDirectly(batch_node, num_workers, graph)) {
return MutateBatchSize(batch_node, num_workers, graph);
if (ShouldMutateBatchSizeDirectly(batch_node, num_replicas, graph)) {
return MutateBatchSize(batch_node, num_replicas, graph);
}
return AppendFlatMap(batch_node, num_workers, flib, graph);
return AppendFlatMap(batch_node, num_replicas, flib, graph);
}
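The rewrite rule implemented by `ShouldMutateBatchSizeDirectly` and `RewriteBatchNode` above can be summarized in a small pure-Python model. This is an illustrative sketch only — the real optimizer manipulates `GraphDef` nodes, and the function name and arguments here are hypothetical:

```python
def per_replica_batch_size(batch_size, num_replicas, drop_remainder):
    """Return the mutated batch size, or None when the flat_map path is needed.

    Mirrors the decision above: mutate the batch size directly only when the
    global batch size is statically known, divides num_replicas evenly, and
    drop_remainder is known to be False.
    """
    if (batch_size is not None
            and batch_size % num_replicas == 0
            and not drop_remainder):
        return batch_size // num_replicas
    # Otherwise AppendFlatMap preserves the global batch size and the
    # drop_remainder behavior via from_tensor_slices(...).batch(...).
    return None

assert per_replica_batch_size(64, 8, drop_remainder=False) == 8
assert per_replica_batch_size(64, 8, drop_remainder=True) is None
assert per_replica_batch_size(None, 8, drop_remainder=False) is None
```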
Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
Status OptimizeGraph(const GrapplerItem& item, int64 num_replicas,
bool use_fallback, GraphDef* output);
// Helper function that starts from a node in the graph and recurses into its
@ -567,16 +603,16 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
// as they are datasets themselves.
// 3. Core dataset ops + Identity op: Recurses into first input parameter.
// 4. FlatMap type mapping dataset ops: Recurses into the function definition.
Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers,
Status RecursivelyHandleOp(const NodeDef& node, int64 num_replicas,
bool use_fallback, FunctionLibraryDefinition* flib,
MutableGraphView* graph) {
if (IsDatasetNodeOfType(node, kBatchDatasetOps)) {
TF_RETURN_IF_ERROR(RewriteBatchNode(node, num_workers, flib, graph));
TF_RETURN_IF_ERROR(RewriteBatchNode(node, num_replicas, flib, graph));
} else if (IsDatasetNodeOfType(node, kMultipleInputsDatasetOps)) {
// For all multiple input datasets, all inputs are datasets themselves.
for (int i = 0; i < node.input_size(); ++i) {
NodeDef* input_node = graph_utils::GetInputNode(node, *graph, i);
TF_RETURN_IF_ERROR(RecursivelyHandleOp(*input_node, num_workers,
TF_RETURN_IF_ERROR(RecursivelyHandleOp(*input_node, num_replicas,
use_fallback, flib, graph));
}
} else if (IsDatasetNodeOfType(node, kPassThroughOps) || IsRetval(node)) {
@ -584,7 +620,7 @@ Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers,
// function body graph in place of function outputs, the input dataset is
// input 0.
NodeDef* input_node = graph_utils::GetInputNode(node, *graph, 0);
TF_RETURN_IF_ERROR(RecursivelyHandleOp(*input_node, num_workers,
TF_RETURN_IF_ERROR(RecursivelyHandleOp(*input_node, num_replicas,
use_fallback, flib, graph));
} else if (IsDatasetNodeOfType(node, kFuncDatasetOps)) {
const string func_name =
@ -594,7 +630,7 @@ Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers,
TF_RETURN_IF_ERROR(MakeGrapplerFunctionItem(
*fdef, *flib, graph->graph()->versions().producer(), &f_item));
GraphDef optimized_func_graph;
TF_RETURN_IF_ERROR(OptimizeGraph(f_item, num_workers, use_fallback,
TF_RETURN_IF_ERROR(OptimizeGraph(f_item, num_replicas, use_fallback,
&optimized_func_graph));
// Function body optimization might have created new specialized
@ -623,7 +659,7 @@ Status RecursivelyHandleOp(const NodeDef& node, int64 num_workers,
}
// If we've successfully updated the batch size of this node or any nodes
// in the dataset tree rooted in this node, we update the output_shapes attr.
TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_workers, graph));
TF_RETURN_IF_ERROR(UpdateOutputShapes(node.name(), num_replicas, graph));
return Status::OK();
}
@ -649,7 +685,7 @@ Status ReshapeComponent(int new_batch_dim, const string& arg, DataType dtype,
// shape = tf.shape(arg)
NodeDef* shape;
TF_RETURN_IF_ERROR(AddShapeNode({arg, 0, dtype}, fdef, &shape));
TF_RETURN_IF_ERROR(AddShapeNode({arg, 0, dtype}, DT_INT32, fdef, &shape));
// later_dimensions = tf.shape(arg)[1:]
NodeDef* later_dimensions;
@ -689,7 +725,7 @@ Status CreateFlatMapFnWithReshape(int new_batch_dim,
// For each component of the dataset, we reshape it from shape
// (old_batch_size, ...) to (-1, new_batch_size, ...)
// where new_batch_size = (old_batch_size + num_workers - 1) // num_workers
// where new_batch_size = (old_batch_size + num_replicas - 1) // num_replicas
for (int i = 0; i < types.size(); ++i) {
auto* input_arg = function_utils::AddFunctionInput(
strings::StrCat("args_", i), result, types.at(i));
@ -733,13 +769,13 @@ Status CreateFlatMapFnWithReshape(int new_batch_dim,
// return tf.data.Dataset.from_tensor_slices(
// tf.reshape(
// x,
// tf.concat([[-1, old_batch_dim / num_workers], tf.shape(x)[1:]], 0)
// tf.concat([[-1, old_batch_dim / num_replicas], tf.shape(x)[1:]], 0)
// )
// )
//
// dataset = dataset.flat_map(fn)
// ```
Status RebatchWithFallback(const NodeDef* fetch_node, int64 num_workers,
Status RebatchWithFallback(const NodeDef* fetch_node, int64 num_replicas,
FunctionLibraryDefinition* flib,
MutableGraphView* graph) {
if (IsRetval(*fetch_node) || fetch_node->op() == kIdentityOp) {
@ -747,26 +783,6 @@ Status RebatchWithFallback(const NodeDef* fetch_node, int64 num_workers,
fetch_node = graph_utils::GetInputNode(*fetch_node, *graph, 0);
}
// Note: Here, we are conservative with only using the fallback when
// the output_shapes attr has the 0th dimension defined for every component.
// This is because the flat_map_fn will fail if the batch does not divide evenly
// because of the use of the "Reshape" op. This ensures that the error is
// surfaced correctly.
AttrValue output_shapes;
if (!fetch_node->attr().contains(kOutputShapesAttr)) {
return errors::InvalidArgument(
"Cannot use rebatching fallback without output_shapes attr. Node: ",
fetch_node->name(), " Op: ", fetch_node->op());
} else {
output_shapes = fetch_node->attr().at(kOutputShapesAttr);
}
int batch_dim;
TF_RETURN_IF_ERROR(GetBatchDim(output_shapes, &batch_dim));
if (batch_dim % num_workers != 0) {
return errors::InvalidArgument(
"Cannot use rebatching fallback when batch dimension doesn't divide "
"num_workers evenly.");
}
// Create the flat map fn
FunctionDef flat_map_fn;
@ -778,15 +794,32 @@ Status RebatchWithFallback(const NodeDef* fetch_node, int64 num_workers,
DataTypeVector output_types;
TF_RETURN_IF_ERROR(
graph_utils::GetDatasetOutputTypesAttr(*fetch_node, &output_types));
TF_RETURN_IF_ERROR(CreateFlatMapFnWithReshape(batch_dim / num_workers,
output_types, &flat_map_fn));
int64 minibatch_dim;
// If the batch dimension is known and perfectly divisible by num_replicas,
// we use a fallback with `tf.reshape` for better performance.
TF_RETURN_IF_ERROR(
GetMinibatchDimForReshape(*fetch_node, num_replicas, &minibatch_dim));
if (minibatch_dim != -1) {
TF_RETURN_IF_ERROR(
CreateFlatMapFnWithReshape(minibatch_dim, output_types, &flat_map_fn));
} else {
TF_RETURN_IF_ERROR(
CreateFlatMapFnWithBatch(output_types, num_replicas, &flat_map_fn));
}
AttrValue output_shapes;
if (fetch_node->attr().contains(kOutputShapesAttr)) {
output_shapes = fetch_node->attr().at(kOutputShapesAttr);
} else {
SetUnknownShapes(output_types.size(), &output_shapes);
}
NodeDef* flat_map_node;
TF_RETURN_IF_ERROR(AddFlatMapNode(strings::StrCat(fetch_node->name(), ":0"),
{}, {}, flat_map_fn, output_shapes,
output_types, flib, graph, &flat_map_node));
TF_RETURN_IF_ERROR(
UpdateOutputShapes(flat_map_node->name(), num_workers, graph));
UpdateOutputShapes(flat_map_node->name(), num_replicas, graph));
TF_RETURN_IF_ERROR(
graph->UpdateFanouts(fetch_node->name(), flat_map_node->name()));
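The reshape fallback described in the pseudocode comment above can be sketched with plain Python lists. This is a hedged illustration of the semantics only — the real code emits Reshape/FlatMap graph nodes and uses `GetMinibatchDimForReshape` to decide whether the reshape path applies:

```python
def reshape_rebatch(batch, num_replicas):
    """Split one global batch into per-replica minibatches, reshape-style."""
    old = len(batch)
    # new_batch_size = (old_batch_size + num_replicas - 1) // num_replicas,
    # matching the comment in CreateFlatMapFnWithReshape.
    new = (old + num_replicas - 1) // num_replicas
    # The reshape trick only works when the batch divides evenly; the real
    # fallback checks this and otherwise re-batches element by element.
    assert old % new == 0
    return [batch[i:i + new] for i in range(0, old, new)]

assert reshape_rebatch(list(range(8)), 4) == [[0, 1], [2, 3], [4, 5], [6, 7]]
assert reshape_rebatch(list(range(6)), 3) == [[0, 1], [2, 3], [4, 5]]
```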
@ -797,7 +830,7 @@ Status RebatchWithFallback(const NodeDef* fetch_node, int64 num_workers,
// Helper function that given a GrapplerItem generates a mutated graph def
// with the batch size changed. The GrapplerItem could be generated from the
// main graph or could be a function graph.
Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
Status OptimizeGraph(const GrapplerItem& item, int64 num_replicas,
bool use_fallback, GraphDef* output) {
*output = item.graph;
MutableGraphView graph(output);
@ -807,8 +840,8 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
NodeDef* sink_node;
TF_RETURN_IF_ERROR(graph_utils::GetFetchNode(graph, item, &sink_node));
Status s =
RecursivelyHandleOp(*sink_node, num_workers, use_fallback, &flib, &graph);
Status s = RecursivelyHandleOp(*sink_node, num_replicas, use_fallback, &flib,
&graph);
if (!s.ok()) {
if (use_fallback) {
VLOG(1) << "Failed to rebatch by rewriting the batch transformation ("
@ -818,7 +851,7 @@ Status OptimizeGraph(const GrapplerItem& item, int64 num_workers,
*output = item.graph;
graph = MutableGraphView(output);
TF_RETURN_IF_ERROR(
RebatchWithFallback(sink_node, num_workers, &flib, &graph));
RebatchWithFallback(sink_node, num_replicas, &flib, &graph));
} else {
// Return the error
return s;
@ -837,7 +870,7 @@ Status RebatchOptimizer::OptimizeAndCollectStats(Cluster* cluster,
*output = item.graph;
MutableGraphView graph(output);
TF_RETURN_IF_ERROR(OptimizeGraph(item, num_workers_, use_fallback_, output));
TF_RETURN_IF_ERROR(OptimizeGraph(item, num_replicas_, use_fallback_, output));
stats->num_changes++;
return Status::OK();
}


@ -23,7 +23,7 @@ namespace tensorflow {
namespace grappler {
// This optimizer changes the batch size of the output dataset by dividing the
// current batch size by parameter `num_workers`. Currently, this works only
// current batch size by parameter `num_replicas`. Currently, this works only
// for very simple pipelines with a single BatchDatasetV2 transformation.
class RebatchOptimizer : public TFDataOptimizerBase {
public:
@ -43,7 +43,7 @@ class RebatchOptimizer : public TFDataOptimizerBase {
const GraphDef& optimize_output, double result) override;
private:
int64 num_workers_;
int64 num_replicas_;
bool use_fallback_;
};


@ -2265,7 +2265,11 @@ Status LayoutOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
config.no_gemm = true;
// TODO(yaozhang): Enable tuning with various TuningConfig choices with
// the measurement-based estimator.
return Tune(item, graph_properties, config, output);
Status status = Tune(item, graph_properties, config, output);
if (!status.ok()) {
*output = item.graph;
}
return status;
}
void LayoutOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,


@ -203,7 +203,7 @@ TEST_F(PinToHostOptimizerTest, Identity) {
// If CUDA, then there is a GPU kernel registration that is pinned to Host
// memory. Consequently, `b` will be mapped to Host correctly if there is
// a GPU kernel registered.
#if GOOGLE_CUDA
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
EXPECT_EQ(node.device(), "/device:CPU:0");
#else
EXPECT_TRUE(node.device().empty());


@ -5533,6 +5533,24 @@ tf_kernel_library(
deps = STRING_DEPS,
)
tf_cc_test(
name = "as_string_op_test",
size = "small",
srcs = ["as_string_op_test.cc"],
deps = [
":as_string_op",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
],
)
tf_kernel_library(
name = "unicode_ops",
prefix = "unicode_ops",
@ -6174,6 +6192,7 @@ filegroup(
"scatter_nd_op.h",
"scatter_nd_op_cpu_impl.h",
"segment_reduction_ops.h",
"segment_reduction_ops_impl.h",
"softplus_op.h",
"softsign_op.h",
"spacetobatch_functor.h",
@ -6370,7 +6389,11 @@ filegroup(
"scatter_nd_op_cpu_impl_5.cc",
"scatter_nd_op_cpu_impl_6.cc",
"scatter_nd_op_cpu_impl_7.cc",
"segment_reduction_ops.cc",
"segment_reduction_ops_impl_1.cc",
"segment_reduction_ops_impl_2.cc",
"segment_reduction_ops_impl_3.cc",
"segment_reduction_ops_impl_4.cc",
"segment_reduction_ops_impl_5.cc",
"session_ops.cc",
"softplus_op.cc",
"softsign_op.cc",
@ -7944,6 +7967,7 @@ cc_library(
"cwise_ops_gpu_common.cu.h",
"cwise_ops_gpu_gradients.cu.h",
"cwise_ops_gradients.h",
"fill_functor.h",
"meta_support.h",
],
deps = [


@ -65,9 +65,26 @@ class AsStringOp : public OpKernel {
OP_REQUIRES(ctx, !(scientific && shortest),
errors::InvalidArgument(
"Cannot select both scientific and shortest notation"));
format_ = "%";
if (!fill_string.empty()) {
switch (fill_string[0]) {
case ' ':
case '+':
case '-':
case '0':
case '#':
strings::Appendf(&format_, "%s", fill_string.c_str());
break;
default:
bool fill_not_supported = true;
OP_REQUIRES(ctx, !fill_not_supported,
errors::InvalidArgument("Fill argument not supported: \"",
fill_string, "\""));
}
}
if (width > -1) {
strings::Appendf(&format_, "%s%d", fill_string.c_str(), width);
strings::Appendf(&format_, "%d", width);
}
if (precision > -1) {
strings::Appendf(&format_, ".%d", precision);
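The format string this kernel assembles is a printf-style conversion spec. A rough Python model of the new order of operations — validate the fill character first, then append width and precision — follows; the function and its `conv` parameter are illustrative, not part of the kernel:

```python
def build_format(fill="", width=-1, precision=-1, conv="d"):
    """Model of the AsString format-string construction after the change."""
    allowed = {" ", "+", "-", "0", "#"}
    fmt = "%"
    if fill:
        # Only printf flag characters are accepted as fill; anything else
        # is rejected up front instead of being spliced into the format.
        if fill[0] not in allowed:
            raise ValueError('Fill argument not supported: "%s"' % fill)
        fmt += fill[0]
    if width > -1:
        fmt += str(width)
    if precision > -1:
        fmt += ".%d" % precision
    return fmt + conv

assert build_format(fill="0", width=4) == "%04d"
assert build_format(width=5, precision=2, conv="f") == "%5.2f"
# Matches the FillWithZero test below: width 4, zero fill.
assert ("%04d" % -42) == "-042"
```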


@ -0,0 +1,245 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
namespace tensorflow {
namespace {
class AsStringGraphTest : public OpsTestBase {
protected:
Status Init(DataType input_type, const string& fill = "", int width = -1,
int precision = -1, bool scientific = false,
bool shortest = false) {
TF_CHECK_OK(NodeDefBuilder("op", "AsString")
.Input(FakeInput(input_type))
.Attr("fill", fill)
.Attr("precision", precision)
.Attr("scientific", scientific)
.Attr("shortest", shortest)
.Attr("width", width)
.Finalize(node_def()));
return InitOp();
}
};
TEST_F(AsStringGraphTest, Int8) {
TF_ASSERT_OK(Init(DT_INT8));
AddInputFromArray<int8>(TensorShape({3}), {-42, 0, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({3}));
test::FillValues<tstring>(&expected, {"-42", "0", "42"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, Int64) {
TF_ASSERT_OK(Init(DT_INT64));
AddInputFromArray<int64>(TensorShape({3}), {-42, 0, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({3}));
test::FillValues<tstring>(&expected, {"-42", "0", "42"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FloatDefault) {
TF_ASSERT_OK(Init(DT_FLOAT));
AddInputFromArray<float>(TensorShape({4}), {-42, 0, 3.14159, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({4}));
test::FillValues<tstring>(
&expected, {"-42.000000", "0.000000", "3.141590", "42.000000"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FloatScientific) {
TF_ASSERT_OK(Init(DT_FLOAT, /*fill=*/"", /*width=*/-1, /*precision=*/-1,
/*scientific=*/true));
AddInputFromArray<float>(TensorShape({4}), {-42, 0, 3.14159, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({4}));
test::FillValues<tstring>(&expected, {"-4.200000e+01", "0.000000e+00",
"3.141590e+00", "4.200000e+01"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FloatShortest) {
TF_ASSERT_OK(Init(DT_FLOAT, /*fill=*/"", /*width=*/-1, /*precision=*/-1,
/*scientific=*/false, /*shortest=*/true));
AddInputFromArray<float>(TensorShape({4}), {-42, 0, 3.14159, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({4}));
test::FillValues<tstring>(&expected, {"-42", "0", "3.14159", "42"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FloatPrecisionOnly) {
TF_ASSERT_OK(Init(DT_FLOAT, /*fill=*/"", /*width=*/-1, /*precision=*/2));
AddInputFromArray<float>(TensorShape({4}), {-42, 0, 3.14159, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({4}));
test::FillValues<tstring>(&expected, {"-42.00", "0.00", "3.14", "42.00"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FloatWidthOnly) {
TF_ASSERT_OK(Init(DT_FLOAT, /*fill=*/"", /*width=*/5));
AddInputFromArray<float>(TensorShape({4}), {-42, 0, 3.14159, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({4}));
test::FillValues<tstring>(
&expected, {"-42.000000", "0.000000", "3.141590", "42.000000"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, Float_5_2_Format) {
TF_ASSERT_OK(Init(DT_FLOAT, /*fill=*/"", /*width=*/5, /*precision=*/2));
AddInputFromArray<float>(TensorShape({4}), {-42, 0, 3.14159, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({4}));
test::FillValues<tstring>(&expected, {"-42.00", " 0.00", " 3.14", "42.00"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, Complex) {
TF_ASSERT_OK(Init(DT_COMPLEX64, /*fill=*/"", /*width=*/5, /*precision=*/2));
AddInputFromArray<complex64>(TensorShape({3}), {{-4, 2}, {0}, {3.14159, -1}});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({3}));
test::FillValues<tstring>(
&expected, {"(-4.00, 2.00)", "( 0.00, 0.00)", "( 3.14,-1.00)"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, Bool) {
TF_ASSERT_OK(Init(DT_BOOL));
AddInputFromArray<bool>(TensorShape({2}), {true, false});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({2}));
test::FillValues<tstring>(&expected, {"true", "false"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, String) {
Status s = Init(DT_STRING);
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(absl::StrContains(
s.error_message(),
"Value for attr 'T' of string is not in the list of allowed values"));
}
TEST_F(AsStringGraphTest, OnlyOneOfScientificAndShortest) {
Status s = Init(DT_FLOAT, /*fill=*/"", /*width=*/-1, /*precision=*/-1,
/*scientific=*/true, /*shortest=*/true);
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(
absl::StrContains(s.error_message(),
"Cannot select both scientific and shortest notation"));
}
TEST_F(AsStringGraphTest, NoShortestForNonFloat) {
Status s = Init(DT_INT32, /*fill=*/"", /*width=*/-1, /*precision=*/-1,
/*scientific=*/false, /*shortest=*/true);
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(absl::StrContains(
s.error_message(),
"scientific and shortest format not supported for datatype"));
}
TEST_F(AsStringGraphTest, NoScientificForNonFloat) {
Status s = Init(DT_INT32, /*fill=*/"", /*width=*/-1, /*precision=*/-1,
/*scientific=*/true);
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(absl::StrContains(
s.error_message(),
"scientific and shortest format not supported for datatype"));
}
TEST_F(AsStringGraphTest, NoPrecisionForNonFloat) {
Status s = Init(DT_INT32, /*fill=*/"", /*width=*/-1, /*precision=*/5);
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(absl::StrContains(s.error_message(),
"precision not supported for datatype"));
}
TEST_F(AsStringGraphTest, LongFill) {
Status s = Init(DT_INT32, /*fill=*/"asdf");
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(absl::StrContains(s.error_message(),
"Fill string must be one or fewer characters"));
}
TEST_F(AsStringGraphTest, FillWithZero) {
TF_ASSERT_OK(Init(DT_INT64, /*fill=*/"0", /*width=*/4));
AddInputFromArray<int64>(TensorShape({3}), {-42, 0, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({3}));
test::FillValues<tstring>(&expected, {"-042", "0000", "0042"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FillWithSpace) {
TF_ASSERT_OK(Init(DT_INT64, /*fill=*/" ", /*width=*/4));
AddInputFromArray<int64>(TensorShape({3}), {-42, 0, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({3}));
test::FillValues<tstring>(&expected, {" -42", " 0", " 42"});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FillWithChar1) {
TF_ASSERT_OK(Init(DT_INT64, /*fill=*/"-", /*width=*/4));
AddInputFromArray<int64>(TensorShape({3}), {-42, 0, 42});
TF_ASSERT_OK(RunOpKernel());
Tensor expected(allocator(), DT_STRING, TensorShape({3}));
test::FillValues<tstring>(&expected, {"-42 ", "0 ", "42 "});
test::ExpectTensorEqual<tstring>(expected, *GetOutput(0));
}
TEST_F(AsStringGraphTest, FillWithChar3) {
Status s = Init(DT_INT32, /*fill=*/"s");
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(
absl::StrContains(s.error_message(), "Fill argument not supported"));
}
TEST_F(AsStringGraphTest, FillWithChar4) {
Status s = Init(DT_INT32, /*fill=*/"n");
ASSERT_EQ(error::INVALID_ARGUMENT, s.code());
ASSERT_TRUE(
absl::StrContains(s.error_message(), "Fill argument not supported"));
}
} // end namespace
} // end namespace tensorflow


@ -109,7 +109,7 @@ class BoostedTreesTrainingPredictOp : public OpKernel {
auto do_work = [&resource, &batch_bucketized_features, &cached_tree_ids,
&cached_node_ids, &output_partial_logits,
&output_node_ids, latest_tree,
this](int32 start, int32 end) {
this](int64 start, int64 end) {
for (int32 i = start; i < end; ++i) {
int32 tree_id = cached_tree_ids(i);
int32 node_id = cached_node_ids(i);
@ -227,7 +227,7 @@ class BoostedTreesPredictOp : public OpKernel {
const int32 last_tree = resource->num_trees() - 1;
auto do_work = [&resource, &batch_bucketized_features, &output_logits,
last_tree, this](int32 start, int32 end) {
last_tree, this](int64 start, int64 end) {
for (int32 i = start; i < end; ++i) {
std::vector<float> tree_logits(logits_dimension_, 0.0);
int32 tree_id = 0;
@ -332,7 +332,7 @@ class BoostedTreesExampleDebugOutputsOp : public OpKernel {
// path. Note: feature_ids has one less value than logits_path because the
// first value of each logit path will be the bias.
auto do_work = [&resource, &batch_bucketized_features, &output_debug_info,
last_tree](int32 start, int32 end) {
last_tree](int64 start, int64 end) {
for (int32 i = start; i < end; ++i) {
// Proto to store debug outputs, per example.
boosted_trees::DebugOutput example_debug_info;


@ -264,6 +264,7 @@ class BoostedTreesFlushQuantileSummariesOp : public OpKernel {
*context->device()->tensorflow_cpu_worker_threads();
Shard(worker_threads.num_threads, worker_threads.workers, num_features_,
kCostPerUnit, do_quantile_summary_gen);
stream_resource->ResetStreams();
}
private:
@ -424,6 +425,7 @@ class BoostedTreesQuantileStreamResourceFlushOp : public OpKernel {
Shard(worker_threads.num_threads, worker_threads.workers, num_streams,
kCostPerUnit, do_quantile_flush);
stream_resource->ResetStreams();
stream_resource->set_buckets_ready(true);
}
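The `ResetStreams()` calls added after each `Shard(...)` flush can be modeled with a toy resource class. This is a hypothetical sketch of the intent — after flushing, the quantile streams are recreated so later additions don't push into an already-finalized stream:

```python
class QuantileStreamResource:
    """Toy model of BoostedTreesQuantileStreamResource's reset-on-flush fix."""

    def __init__(self, num_streams):
        self.num_streams = num_streams
        self.reset_streams()

    def reset_streams(self):
        # Fresh, unfinalized streams (modeled here as empty lists).
        self.streams = [[] for _ in range(self.num_streams)]

    def flush(self):
        summaries = [list(s) for s in self.streams]
        # The added call: streams are usable again after a flush.
        self.reset_streams()
        return summaries

r = QuantileStreamResource(2)
r.streams[0].append(1.0)
assert r.flush() == [[1.0], []]
assert r.streams == [[], []]  # streams were reset, ready for reuse
```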


@ -67,6 +67,14 @@ class BoostedTreesQuantileStreamResource : public ResourceBase {
are_buckets_ready_ = are_buckets_ready;
}
void ResetStreams() {
streams_.clear();
streams_.reserve(num_streams_);
for (int64 idx = 0; idx < num_streams_; ++idx) {
streams_.push_back(QuantileStream(epsilon_, max_elements_));
}
}
private:
~BoostedTreesQuantileStreamResource() override {}


@ -1001,6 +1001,10 @@ class FusedConv2DWithBatchNormOpTest : public FusedConv2DOpTest<T> {};
TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest);
TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest);
// ROCm does not yet support the _FusedConv2D op; therefore, disable tests
// that check _FusedConv2D when building with ROCm.
#ifndef TENSORFLOW_USE_ROCM
// -------------------------------------------------------------------------- //
// Conv2D + BiasAdd + {Activation} //
// -------------------------------------------------------------------------- //
@ -1165,4 +1169,5 @@ using FusedBatchNormDataTypes = ::testing::Types<float>;
INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest,
FusedBatchNormDataTypes);
#endif // TENSORFLOW_USE_ROCM
} // namespace tensorflow


@ -57,11 +57,23 @@ BinaryOpShared::BinaryOpState::BinaryOpState(OpKernelContext* ctx)
in1(ctx->input(1)),
bcast(BCast::FromShape(in0.shape()), BCast::FromShape(in1.shape())) {
if (!bcast.IsValid()) {
bool incompatible_shape_error;
bool has_attr =
GetNodeAttrSimple(ctx->op_kernel().def(), "incompatible_shape_error",
&(incompatible_shape_error));
if (has_attr && !incompatible_shape_error) {
const string& op = ctx->op_kernel().type_string();
OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &out));
result = (op == "NotEqual");
return;
}
ctx->SetStatus(errors::InvalidArgument(
"Incompatible shapes: ", in0.shape().DebugString(), " vs. ",
in1.shape().DebugString()));
return;
}
const TensorShape output_shape = BCast::ToShape(bcast.output_shape());
out_num_elements = output_shape.num_elements();
in0_num_elements = in0.NumElements();
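The new early-return path in `BinaryOpState` can be summarized in a few lines of Python. This is a hedged model of the behavior, not the kernel itself: when broadcasting fails and the op was built with `incompatible_shape_error=False`, a scalar result is produced instead of an `InvalidArgument` error — `True` for `NotEqual`, `False` otherwise:

```python
def broadcastless_compare(op, broadcast_ok, incompatible_shape_error=True):
    """Model of the shape-mismatch fallback for Equal/NotEqual kernels."""
    if not broadcast_ok:
        if not incompatible_shape_error:
            # Incompatible shapes: the tensors are trivially "not equal",
            # so NotEqual yields True and Equal yields False, as scalars.
            return op == "NotEqual"
        raise ValueError("Incompatible shapes")
    return None  # normal broadcast path, not modeled here

assert broadcastless_compare("NotEqual", False, incompatible_shape_error=False) is True
assert broadcastless_compare("Equal", False, incompatible_shape_error=False) is False
```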


@ -26,13 +26,13 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_sycl_common.h"
#endif
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/kernels/cwise_ops_gradients.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/kernels/cwise_ops_gradients.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/util/bcast.h"
@ -56,7 +56,7 @@ class BinaryOpShared : public OpKernel {
// in-place computation.
// Caller must check ctx->status() upon return for non-ok status.
// If ctx->status().ok() is true, then out is guaranteed to be allocated.
BinaryOpState(OpKernelContext* ctx);
explicit BinaryOpState(OpKernelContext* ctx);
const Tensor& in0;
const Tensor& in1;
@ -69,6 +69,7 @@ class BinaryOpShared : public OpKernel {
int64 in1_num_elements;
int ndims;
bool result;
};
void SetUnimplementedError(OpKernelContext* ctx);
@ -91,16 +92,29 @@ class BinaryOp : public BinaryOpShared {
void Compute(OpKernelContext* ctx) override {
// 'state': Shared helper not dependent on T to reduce code size
BinaryOpState state(ctx);
if (!ctx->status().ok()) return;
auto& bcast = state.bcast;
const Device& eigen_device = ctx->eigen_device<Device>();
Tensor* out = state.out;
BCast* bcast = &state.bcast;
if (!bcast.IsValid()) {
if (ctx->status().ok()) {
if (state.result) {
functor::SetOneFunctor<Device, bool>()(eigen_device,
out->flat<bool>());
} else {
functor::SetZeroFunctor<Device, bool>()(eigen_device,
out->flat<bool>());
}
}
return;
}
auto& in0 = state.in0;
auto& in1 = state.in1;
if (state.out_num_elements == 0) {
return;
}
const int ndims = state.ndims;
const Device& eigen_device = ctx->eigen_device<Device>();
bool error = false;
     bool* const error_ptr = Functor::has_errors ? &error : nullptr;
     if (ndims <= 1) {
@@ -122,32 +136,32 @@ class BinaryOp : public BinaryOpShared {
       }
     } else if (ndims == 2) {
       functor::BinaryFunctor<Device, Functor, 2>().BCast(
-          eigen_device, out->shaped<Tout, 2>(bcast->result_shape()),
-          in0.template shaped<Tin, 2>(bcast->x_reshape()),
-          BCast::ToIndexArray<2>(bcast->x_bcast()),
-          in1.template shaped<Tin, 2>(bcast->y_reshape()),
-          BCast::ToIndexArray<2>(bcast->y_bcast()), error_ptr);
+          eigen_device, out->shaped<Tout, 2>(bcast.result_shape()),
+          in0.template shaped<Tin, 2>(bcast.x_reshape()),
+          BCast::ToIndexArray<2>(bcast.x_bcast()),
+          in1.template shaped<Tin, 2>(bcast.y_reshape()),
+          BCast::ToIndexArray<2>(bcast.y_bcast()), error_ptr);
     } else if (ndims == 3) {
       functor::BinaryFunctor<Device, Functor, 3>().BCast(
-          eigen_device, out->shaped<Tout, 3>(bcast->result_shape()),
-          in0.template shaped<Tin, 3>(bcast->x_reshape()),
-          BCast::ToIndexArray<3>(bcast->x_bcast()),
-          in1.template shaped<Tin, 3>(bcast->y_reshape()),
-          BCast::ToIndexArray<3>(bcast->y_bcast()), error_ptr);
+          eigen_device, out->shaped<Tout, 3>(bcast.result_shape()),
+          in0.template shaped<Tin, 3>(bcast.x_reshape()),
+          BCast::ToIndexArray<3>(bcast.x_bcast()),
+          in1.template shaped<Tin, 3>(bcast.y_reshape()),
+          BCast::ToIndexArray<3>(bcast.y_bcast()), error_ptr);
     } else if (ndims == 4) {
       functor::BinaryFunctor<Device, Functor, 4>().BCast(
-          eigen_device, out->shaped<Tout, 4>(bcast->result_shape()),
-          in0.template shaped<Tin, 4>(bcast->x_reshape()),
-          BCast::ToIndexArray<4>(bcast->x_bcast()),
-          in1.template shaped<Tin, 4>(bcast->y_reshape()),
-          BCast::ToIndexArray<4>(bcast->y_bcast()), error_ptr);
+          eigen_device, out->shaped<Tout, 4>(bcast.result_shape()),
+          in0.template shaped<Tin, 4>(bcast.x_reshape()),
+          BCast::ToIndexArray<4>(bcast.x_bcast()),
+          in1.template shaped<Tin, 4>(bcast.y_reshape()),
+          BCast::ToIndexArray<4>(bcast.y_bcast()), error_ptr);
     } else if (ndims == 5) {
       functor::BinaryFunctor<Device, Functor, 5>().BCast(
-          eigen_device, out->shaped<Tout, 5>(bcast->result_shape()),
-          in0.template shaped<Tin, 5>(bcast->x_reshape()),
-          BCast::ToIndexArray<5>(bcast->x_bcast()),
-          in1.template shaped<Tin, 5>(bcast->y_reshape()),
-          BCast::ToIndexArray<5>(bcast->y_bcast()), error_ptr);
+          eigen_device, out->shaped<Tout, 5>(bcast.result_shape()),
+          in0.template shaped<Tin, 5>(bcast.x_reshape()),
+          BCast::ToIndexArray<5>(bcast.x_bcast()),
+          in1.template shaped<Tin, 5>(bcast.y_reshape()),
+          BCast::ToIndexArray<5>(bcast.y_bcast()), error_ptr);
     } else {
       SetUnimplementedError(ctx);
     }


@@ -36,14 +36,15 @@ class RebatchDatasetOp : public UnaryDatasetOpKernel {
  protected:
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
-    int64 num_workers;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_workers", &num_workers));
+    int64 num_replicas;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "num_replicas", &num_replicas));
     OP_REQUIRES(
-        ctx, num_workers > 0,
-        errors::InvalidArgument("num_workers must be greater than zero."));
+        ctx, num_replicas > 0,
+        errors::InvalidArgument("num_replicas must be greater than zero."));
-    auto config_factory = [num_workers, this]() {
-      return CreateConfig(num_workers, this->use_fallback_);
+    auto config_factory = [num_replicas, this]() {
+      return CreateConfig(num_replicas, this->use_fallback_);
     };
     // We only want to optimize functions for some particular datasets like
@@ -56,17 +57,17 @@ class RebatchDatasetOp : public UnaryDatasetOpKernel {
   }
  private:
-  static RewriterConfig CreateConfig(int64 num_workers, bool use_fallback) {
+  static RewriterConfig CreateConfig(int64 num_replicas, bool use_fallback) {
     RewriterConfig rewriter_config;
     rewriter_config.set_fail_on_optimizer_errors(true);
     rewriter_config.add_optimizers(kOptimizerName);
     rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE);
     auto custom_optimizer = rewriter_config.add_custom_optimizers();
     custom_optimizer->set_name(kOptimizerName);
-    AttrValue num_workers_attr;
-    num_workers_attr.set_i(num_workers);
-    (*custom_optimizer->mutable_parameter_map())["num_workers"] =
-        num_workers_attr;
+    AttrValue num_replicas_attr;
+    num_replicas_attr.set_i(num_replicas);
+    (*custom_optimizer->mutable_parameter_map())["num_replicas"] =
+        num_replicas_attr;
     AttrValue use_fallback_attr;
     use_fallback_attr.set_b(use_fallback);
     (*custom_optimizer->mutable_parameter_map())["use_fallback"] =


@@ -81,8 +81,16 @@ AnonymousRandomSeedGeneratorHandleOp::AnonymousRandomSeedGeneratorHandleOp(
     : AnonymousResourceOp<RandomSeedGenerator>(ctx) {}
 void AnonymousRandomSeedGeneratorHandleOp::Compute(OpKernelContext* ctx) {
-  OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, kSeed, &seed_));
-  OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, kSeed2, &seed2_));
+  int64 seed;
+  OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, kSeed, &seed));
+  int64 seed2;
+  OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, kSeed2, &seed2));
+  if (seed == 0 && seed2 == 0) {
+    seed = random::New64();
+    seed2 = random::New64();
+  }
+  seed_ = seed;
+  seed2_ = seed2;
   AnonymousResourceOp<RandomSeedGenerator>::Compute(ctx);
 }
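The hunk above changes the handle op so that a `(0, 0)` seed pair means "no seed given" and fresh nondeterministic seeds are drawn instead of seeding every generator identically. A minimal standalone sketch of that selection rule, where `NewRandom64` and `ChooseSeeds` are hypothetical stand-ins (TensorFlow uses `random::New64`, which is not reproduced here):

```cpp
#include <cstdint>
#include <random>
#include <utility>

// Stand-in for TensorFlow's random::New64: a nondeterministic 64-bit value.
static int64_t NewRandom64() {
  static std::random_device rd;
  return (static_cast<int64_t>(rd()) << 32) ^ static_cast<int64_t>(rd());
}

// Mirrors the rule in the hunk: keep user-provided seeds, but if both are
// zero, substitute fresh random seeds.
static std::pair<int64_t, int64_t> ChooseSeeds(int64_t seed, int64_t seed2) {
  if (seed == 0 && seed2 == 0) {
    seed = NewRandom64();
    seed2 = NewRandom64();
  }
  return {seed, seed2};
}
```

Note that a single zero seed (e.g. `(0, 5)`) is still honored verbatim; only the all-zero pair triggers the fallback.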


@@ -18,16 +18,52 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include "tensorflow/core/kernels/data_format_ops.h"
+#include <map>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
 typedef Eigen::GpuDevice GPUDevice;
+// Ensure that `src` and `dst` define a valid permutation.
+// Ops defined in this file assume that user specifies a permutation via two
+// string attributes. This check validates that these attributes properly define
+// it to prevent security vulnerabilities.
+static bool IsValidPermutation(const std::string& src, const std::string& dst) {
+  if (src.size() != dst.size()) {
+    return false;
+  }
+  std::map<char, bool> characters;
+  // Every character in `src` must be present only once
+  for (const auto c : src) {
+    if (characters[c]) {
+      return false;
+    }
+    characters[c] = true;
+  }
+  // Every character in `dst` must show up in `src` exactly once
+  for (const auto c : dst) {
+    if (!characters[c]) {
+      return false;
+    }
+    characters[c] = false;
+  }
+  // At this point, characters[] has been switched to true and false exactly
+  // once for all character in `src` (and `dst`) so we have a valid permutation
+  return true;
+}
 template <typename Device, typename T>
 class DataFormatDimMapOp : public OpKernel {
  public:
@@ -37,15 +73,20 @@ class DataFormatDimMapOp : public OpKernel {
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
-    OP_REQUIRES(context, src_format.size() == 4,
-                errors::InvalidArgument(strings::StrCat(
-                    "Source format must of length 4, received src_format = ",
-                    src_format)));
+    OP_REQUIRES(context, src_format.size() == 4 || src_format.size() == 5,
+                errors::InvalidArgument(
+                    "Source format must be of length 4 or 5, received "
+                    "src_format = ",
+                    src_format));
+    OP_REQUIRES(context, dst_format.size() == 4 || dst_format.size() == 5,
+                errors::InvalidArgument("Destination format must be of length "
+                                        "4 or 5, received dst_format = ",
+                                        dst_format));
     OP_REQUIRES(
-        context, dst_format.size() == 4,
-        errors::InvalidArgument(strings::StrCat(
-            "Destination format must of length 4, received dst_format = ",
-            dst_format)));
+        context, IsValidPermutation(src_format, dst_format),
+        errors::InvalidArgument(
+            "Destination and source format must determine a permutation, got ",
+            src_format, " and ", dst_format));
     dst_idx_ = Tensor(DT_INT32, {static_cast<int64>(src_format.size())});
     for (int i = 0; i < src_format.size(); ++i) {
       for (int j = 0; j < dst_format.size(); ++j) {
@@ -77,8 +118,22 @@ class DataFormatVecPermuteOp : public OpKernel {
       : OpKernel(context) {
     string src_format;
     OP_REQUIRES_OK(context, context->GetAttr("src_format", &src_format));
+    OP_REQUIRES(context, src_format.size() == 4 || src_format.size() == 5,
+                errors::InvalidArgument(
+                    "Source format must be of length 4 or 5, received "
+                    "src_format = ",
+                    src_format));
     string dst_format;
     OP_REQUIRES_OK(context, context->GetAttr("dst_format", &dst_format));
+    OP_REQUIRES(context, dst_format.size() == 4 || dst_format.size() == 5,
+                errors::InvalidArgument("Destination format must be of length "
+                                        "4 or 5, received dst_format = ",
+                                        dst_format));
+    OP_REQUIRES(
+        context, IsValidPermutation(src_format, dst_format),
+        errors::InvalidArgument(
+            "Destination and source format must determine a permutation, got ",
+            src_format, " and ", dst_format));
     src_format_ = src_format;
     dst_format_ = dst_format;
   }
@@ -112,6 +167,24 @@ class DataFormatVecPermuteOp : public OpKernel {
                    context->allocate_output(0, input.shape(), &output));
     // Support 1D and 2D cases.
     Eigen::DSizes<Eigen::DenseIndex, 8> dst_idx;
+    string src_format_str = src_format_;
+    string dst_format_str = dst_format_;
+    if (input.dim_size(0) == 2) {
+      // If the input is a vector of size 2, treat the two elements as spatial
+      // dimensions.
+      auto keep_only_spatial_dimensions = [](string* format_str) -> void {
+        auto new_end = std::remove_if(
+            format_str->begin(), format_str->end(),
+            [](const char dim) { return dim != 'H' && dim != 'W'; });
+        format_str->erase(new_end, format_str->end());
+      };
+      keep_only_spatial_dimensions(&src_format_str);
+      keep_only_spatial_dimensions(&dst_format_str);
+      OP_REQUIRES(context,
+                  src_format_str.size() == 2 && dst_format_str.size() == 2,
+                  errors::InvalidArgument(
+                      "Format specifier must contain H and W for 2D case"));
+    }
     ComputeDstIndex(input.dims(), &dst_idx);
     functor::DataFormatVecPermute<Device, T>()(context->eigen_device<Device>(),
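The `IsValidPermutation` helper introduced above is self-contained and can be exercised outside TensorFlow. The copy below mirrors the hunk, with `std::string` in place of TF's `string` alias; nothing else is changed:

```cpp
#include <map>
#include <string>

// `dst` must be a rearrangement of `src` with no duplicate characters.
static bool IsValidPermutation(const std::string& src, const std::string& dst) {
  if (src.size() != dst.size()) {
    return false;
  }
  std::map<char, bool> characters;
  // Every character in `src` must be present only once.
  for (const auto c : src) {
    if (characters[c]) {
      return false;
    }
    characters[c] = true;
  }
  // Every character in `dst` must show up in `src` exactly once.
  for (const auto c : dst) {
    if (!characters[c]) {
      return false;
    }
    characters[c] = false;
  }
  return true;
}
```

For example, `IsValidPermutation("NHWC", "NCHW")` holds, while a repeated axis such as `"NHHC"` or a character absent from the source is rejected, which is exactly what blocks the malformed-format-string vulnerability.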


@@ -62,6 +62,12 @@ class MemmappedTensorAllocator : public Allocator {
   void set_delete_on_deallocate() { delete_on_deallocate_ = true; }
+  // Make sure tensors or complex types (strings, variants, resources) don't
+  // get their constructor called via a placement new since that would require
+  // writing to immutable data.
+  // See also: tensorflow/core/framework/typed_allocator.h
+  bool AllocatesOpaqueHandle() const override { return true; }
  private:
   std::unique_ptr<ReadOnlyMemoryRegion> memory_region_;
   // If there is an error during allocation we keep it in this status.


@@ -116,7 +116,7 @@ REGISTER_KERNEL_BUILDER(Name("InTopKV2")
                             .TypeConstraint<int64>("T"),
                         InTopK<CPUDevice, float, int64>);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
@@ -142,6 +142,6 @@ REGISTER_KERNEL_BUILDER(
     Name("InTopKV2").Device(DEVICE_GPU).TypeConstraint<int64>("T"),
     InTopK<GPUDevice, float, int64>);
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 }  // namespace tensorflow


@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_IN_TOPK_OP_H_
 #define TENSORFLOW_CORE_KERNELS_IN_TOPK_OP_H_
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/bounds_check.h"


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA)
+#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -41,7 +41,7 @@ __global__ void ComputePredictionMaskKernel(
     const TargetT* targets,  // dims: [ num_targets ]
     int64* mask,             // dims: [ num_targets x num_classes ]
     int num_targets, int num_classes) {
-  CUDA_1D_KERNEL_LOOP(i, num_targets * num_classes) {
+  GPU_1D_KERNEL_LOOP(i, num_targets * num_classes) {
     const int batch_index = i / num_classes;
     TargetT target_idx = ldg(targets + batch_index);
@@ -118,7 +118,7 @@ struct InTopKFunctor<GPUDevice, T, TargetT> {
     const auto& d = context->eigen_device<GPUDevice>();
     // Compute a mask for all predictions.
-    CudaLaunchConfig config = GetGpuLaunchConfig(num_targets * num_classes, d);
+    GpuLaunchConfig config = GetGpuLaunchConfig(num_targets * num_classes, d);
     OP_REQUIRES_OK(
         context, GpuLaunchKernel(ComputePredictionMaskKernel<T, TargetT>,
                                  config.block_count, config.thread_per_block, 0,
@@ -173,4 +173,4 @@ DEFINE_GPU_KERNELS(float, int64);
 }  // end namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM


@@ -29,7 +29,6 @@ limitations under the License.
 #include <vector>
 #include "mkl_cblas.h"
-#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -41,13 +40,17 @@ limitations under the License.
 #include "tensorflow/core/kernels/fill_functor.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/matmul_bcast.h"
 #include "tensorflow/core/util/mkl_util.h"
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
-template <typename Device, typename Scalar>
+// The third parameter v2_bcast is set to true if we are using V2 otherwise
+// we set it to false.
+template <typename Device, typename Scalar, bool v2_bcast>
 class BatchMatMulMkl : public OpKernel {
  public:
  explicit BatchMatMulMkl(OpKernelConstruction *context) : OpKernel(context) {
@@ -60,28 +63,54 @@ class BatchMatMulMkl : public OpKernel {
   void Compute(OpKernelContext *ctx) override {
     const Tensor &lhs = ctx->input(0);
     const Tensor &rhs = ctx->input(1);
-    OP_REQUIRES(ctx, lhs.dims() == rhs.dims(),
-                errors::InvalidArgument("lhs and rhs has different ndims: ",
-                                        lhs.shape().DebugString(), " vs. ",
-                                        rhs.shape().DebugString()));
-    const int ndims = lhs.dims();
-    OP_REQUIRES(
-        ctx, ndims >= 2,
-        errors::InvalidArgument("lhs and rhs ndims must be >= 2: ", ndims));
-    TensorShape out_shape;
-    for (int i = 0; i < ndims - 2; ++i) {
-      OP_REQUIRES(ctx, lhs.dim_size(i) == rhs.dim_size(i),
-                  errors::InvalidArgument(
-                      "lhs.dim(", i, ") and rhs.dim(", i,
-                      ") must be the same: ", lhs.shape().DebugString(), " vs ",
-                      rhs.shape().DebugString()));
-      out_shape.AddDim(lhs.dim_size(i));
+    if (!v2_bcast) {
+      // Using V1, so check to make sure lhs and rhs dimensions are correct and
+      // no broadcasting is needed.
+      OP_REQUIRES(ctx, lhs.dims() == rhs.dims(),
+                  errors::InvalidArgument("lhs and rhs has different ndims: ",
                                          lhs.shape().DebugString(), " vs. ",
+                                          rhs.shape().DebugString()));
+      const int ndims = lhs.dims();
+      OP_REQUIRES(
+          ctx, ndims >= 2,
+          errors::InvalidArgument("lhs and rhs ndims must be >= 2: ", ndims));
+      for (int i = 0; i < ndims - 2; ++i) {
+        OP_REQUIRES(ctx, lhs.dim_size(i) == rhs.dim_size(i),
+                    errors::InvalidArgument("lhs.dim(", i, ") and rhs.dim(", i,
+                                            ") must be the same: ",
+                                            lhs.shape().DebugString(), " vs ",
+                                            rhs.shape().DebugString()));
+      }
+    } else {
+      OP_REQUIRES(
+          ctx, lhs.dims() >= 2,
+          errors::InvalidArgument("In[0] ndims must be >= 2: ", lhs.dims()));
+      OP_REQUIRES(
+          ctx, rhs.dims() >= 2,
+          errors::InvalidArgument("In[1] ndims must be >= 2: ", rhs.dims()));
     }
-    auto batch_size = (ndims == 2) ? 1 : out_shape.num_elements();
-    auto lhs_rows = lhs.dim_size(ndims - 2);
-    auto lhs_cols = lhs.dim_size(ndims - 1);
-    auto rhs_rows = rhs.dim_size(ndims - 2);
-    auto rhs_cols = rhs.dim_size(ndims - 1);
+    // lhs and rhs can have different dimensions
+    const int ndims_lhs = lhs.dims();
+    const int ndims_rhs = rhs.dims();
+    // Get broadcast info
+    MatMulBCast bcast(lhs.shape().dim_sizes(), rhs.shape().dim_sizes());
+    OP_REQUIRES(
+        ctx, bcast.IsValid(),
+        errors::InvalidArgument(
+            "In[0] and In[1] must have compatible batch dimensions: ",
+            lhs.shape().DebugString(), " vs. ", rhs.shape().DebugString()));
+    TensorShape out_shape = bcast.output_batch_shape();
+    auto batch_size = bcast.output_batch_size();
+    auto lhs_rows = lhs.dim_size(ndims_lhs - 2);
+    auto lhs_cols = lhs.dim_size(ndims_lhs - 1);
+    auto rhs_rows = rhs.dim_size(ndims_rhs - 2);
+    auto rhs_cols = rhs.dim_size(ndims_rhs - 1);
     if (adj_x_) std::swap(lhs_rows, lhs_cols);
     if (adj_y_) std::swap(rhs_rows, rhs_cols);
     OP_REQUIRES(ctx, lhs_cols == rhs_rows,
@@ -89,8 +118,10 @@ class BatchMatMulMkl : public OpKernel {
                     "lhs mismatch rhs shape: ", lhs_cols, " vs. ", rhs_rows,
                     ": ", lhs.shape().DebugString(), " ",
                     rhs.shape().DebugString(), " ", adj_x_, " ", adj_y_));
     out_shape.AddDim(lhs_rows);
     out_shape.AddDim(rhs_cols);
     Tensor *out = nullptr;
     OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));
     if (out->NumElements() == 0) {
@@ -122,10 +153,24 @@ class BatchMatMulMkl : public OpKernel {
     a_array.reserve(batch_size);
     b_array.reserve(batch_size);
     c_array.reserve(batch_size);
-    for (int64 i = 0; i < batch_size; i++) {
-      a_array.push_back(&lhs_reshaped(i, 0, 0));
-      b_array.push_back(&rhs_reshaped(i, 0, 0));
-      c_array.push_back(&out_reshaped(i, 0, 0));
+    if (!bcast.IsBroadcastingRequired()) {
+      for (int64 i = 0; i < batch_size; i++) {
+        a_array.push_back(&lhs_reshaped(i, 0, 0));
+        b_array.push_back(&rhs_reshaped(i, 0, 0));
+        c_array.push_back(&out_reshaped(i, 0, 0));
+      }
+    } else {
+      // Broadcasting is needed, so get the mapping from flattened output batch
+      // indices to x's and y's flattened batch indices.
+      const std::vector<int64> &a_batch_indices = bcast.x_batch_indices();
+      const std::vector<int64> &b_batch_indices = bcast.y_batch_indices();
+      for (int64 i = 0; i < batch_size; i++) {
+        a_array.push_back(&lhs_reshaped(a_batch_indices[i], 0, 0));
+        b_array.push_back(&rhs_reshaped(b_batch_indices[i], 0, 0));
+        c_array.push_back(&out_reshaped(i, 0, 0));
+      }
     }
     MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, &m_array[0], &n_array[0],
@@ -226,13 +271,25 @@ class BatchMatMulMkl : public OpKernel {
                               .Device(DEVICE_CPU)                             \
                               .TypeConstraint<TYPE>("T")                      \
                               .Label(mkl_op_registry::kMklNameChangeOpLabel), \
-                          BatchMatMulMkl<CPUDevice, TYPE>)
+                          BatchMatMulMkl<CPUDevice, TYPE, false>)
+#define REGISTER_BATCH_MATMUL_MKL_V2(TYPE)                                    \
+  REGISTER_KERNEL_BUILDER(Name("_MklBatchMatMulV2")                           \
+                              .Device(DEVICE_CPU)                             \
+                              .TypeConstraint<TYPE>("T")                      \
+                              .Label(mkl_op_registry::kMklNameChangeOpLabel), \
+                          BatchMatMulMkl<CPUDevice, TYPE, true>)
 #ifdef ENABLE_MKL
 TF_CALL_float(REGISTER_BATCH_MATMUL_MKL);
 TF_CALL_double(REGISTER_BATCH_MATMUL_MKL);
 TF_CALL_complex64(REGISTER_BATCH_MATMUL_MKL);
 TF_CALL_complex128(REGISTER_BATCH_MATMUL_MKL);
+TF_CALL_float(REGISTER_BATCH_MATMUL_MKL_V2);
+TF_CALL_double(REGISTER_BATCH_MATMUL_MKL_V2);
+TF_CALL_complex64(REGISTER_BATCH_MATMUL_MKL_V2);
+TF_CALL_complex128(REGISTER_BATCH_MATMUL_MKL_V2);
 #endif  // ENABLE_MKL
 }  // end namespace tensorflow


@@ -70,6 +70,8 @@ class MklBinaryOp : public BinaryOp<Device, Functor> {
 REGISTER6(MklBinaryOp, CPU, "_MklAdd", functor::add, float, Eigen::half, double,
           int32, int64, bfloat16);
+REGISTER6(MklBinaryOp, CPU, "_MklAddV2", functor::add, float, Eigen::half,
+          double, int32, int64, bfloat16);
 REGISTER8(MklBinaryOp, CPU, "_MklSub", functor::sub, float, Eigen::half, double,
           int32, int64, complex64, complex128, bfloat16);
 REGISTER6(MklBinaryOp, CPU, "_MklMul", functor::mul, float, Eigen::half, double,


@@ -95,7 +95,8 @@ struct NthElementFunctor<CPUDevice, T> {
     const int last_dim = input_tensor.dim_size(input_tensor.dims() - 1);
     // Allocate each row to different shard.
-    auto SubNthElement = [&, input, output, last_dim, n](int start, int limit) {
+    auto SubNthElement = [&, input, output, last_dim, n](int64 start,
+                                                         int64 limit) {
       // std::nth_element would rearrange the array, so we need a new buffer.
       std::vector<T> buf(last_dim);
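This hunk, and the several that follow, widen shard-lambda parameters from `int` to `int64`. The sharding machinery invokes these workers through a `std::function` taking 64-bit bounds; a lambda declaring `int` parameters still converts, but every call silently narrows the range. A hedged illustration (`SeenLimit` is a hypothetical helper written for this sketch, not TF code):

```cpp
#include <cstdint>
#include <functional>

// Returns the `limit` value the worker actually observes when invoked with a
// bound larger than INT_MAX, for both the pre-fix (int) and post-fix (int64)
// parameter signatures.
static int64_t SeenLimit(bool use_int64_params) {
  const int64_t big = (int64_t{1} << 32) + 5;  // exceeds INT_MAX
  int64_t seen = 0;
  std::function<void(int64_t, int64_t)> work;
  if (use_int64_params) {
    // Post-fix signature: the 64-bit bound arrives intact.
    work = [&seen](int64_t /*start*/, int64_t limit) { seen = limit; };
  } else {
    // Pre-fix signature: each invocation narrows the 64-bit bound to int.
    work = [&seen](int /*start*/, int limit) { seen = limit; };
  }
  work(0, big);
  return seen;
}
```

With `int` parameters the observed limit cannot equal the true 64-bit bound, so shards covering more than 2^31 - 1 elements process the wrong range; widening to `int64` removes the truncation.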


@@ -69,8 +69,8 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
     auto DoWork = [samples_per_batch, num_elements, &ctx, &means, &stddevs,
                    &minvals, &maxvals, &gen, &output,
-                   kStdDevsInsideBoundsToUseRandnSampler](int start_batch,
-                                                          int limit_batch) {
+                   kStdDevsInsideBoundsToUseRandnSampler](int64 start_batch,
+                                                          int64 limit_batch) {
       // Capturing "gen" by-value would only make a copy for the _shared_
       // lambda. Since we want to let each worker have its own copy, we pass
       // "gen" by reference and explicitly do a copy assignment here.


@@ -176,7 +176,7 @@ struct RandomBinomialFunctor<CPUDevice, T, U> {
     auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
     auto DoWork = [samples_per_batch, num_elements, &counts, &probs, &gen,
-                   &output](int start_batch, int limit_batch) {
+                   &output](int64 start_batch, int64 limit_batch) {
       // Capturing "gen" by-value would only make a copy for the _shared_
       // lambda. Since we want to let each worker have its own copy, we pass
       // "gen" by reference and explicitly do a copy assignment here.


@@ -204,7 +204,7 @@ class RandomGammaOp : public OpKernel {
     // avoid a couple flops which can be done on a per-alpha basis.
     auto DoWork = [num_samples, num_alphas, &rng, samples_flat, alpha_flat](
-                      int start_output, int limit_output) {
+                      int64 start_output, int64 limit_output) {
       using Eigen::numext::exp;
       using Eigen::numext::log;
       using Eigen::numext::pow;


@@ -103,7 +103,7 @@ struct PoissonFunctor<CPUDevice, T, U> {
   typedef random::UniformDistribution<random::PhiloxRandom, CT> Uniform;
   auto DoWork = [num_samples, num_rate, &rng, samples_flat, rate_flat](
-                    int start_output, int limit_output) {
+                    int64 start_output, int64 limit_output) {
     // Capturing "rng" by value would only make a copy for the _shared_
     // lambda. Since we want to let each worker have its own copy, we pass
     // "rng" by reference and explicitly do a copy assignment.


@@ -30,7 +30,7 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("reduction_indices"),
     ReductionOp<CPUDevice, bool, int64, Eigen::internal::AndReducer>);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(
     Name("All")
         .TypeConstraint<int32>("Tidx")


@@ -30,7 +30,7 @@ REGISTER_KERNEL_BUILDER(
         .HostMemory("reduction_indices"),
     ReductionOp<CPUDevice, bool, int64, Eigen::internal::OrReducer>);
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 REGISTER_KERNEL_BUILDER(
     Name("Any")
         .TypeConstraint<int32>("Tidx")


@@ -15,8 +15,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_GPU_H_
 #define TENSORFLOW_CORE_KERNELS_REDUCTION_OPS_COMMON_GPU_H_
-#if !GOOGLE_CUDA
-#error This file must only be included when building with Cuda support
+#if !GOOGLE_CUDA && !TENSORFLOW_USE_ROCM
+#error This file must only be included when building with GPU support
 #endif
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"


@@ -33,7 +33,7 @@ namespace tensorflow {
 TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNELS(type)                    \
   REGISTER_KERNEL_BUILDER(Name("EuclideanNorm")       \
@@ -51,8 +51,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           ReductionOp<GPUDevice, type, int64,         \
                       functor::EuclideanNormReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#if GOOGLE_CUDA
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
 TF_CALL_complex128(REGISTER_GPU_KERNELS);
+#endif
 #undef REGISTER_GPU_KERNELS
 #endif


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -59,4 +59,4 @@ DEFINE_FOR_TYPE_AND_R(bool, Eigen::internal::OrReducer);
 }  // end namespace functor
 }  // end namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -67,4 +67,4 @@ DEFINE_FOR_ALL_REDUCERS(double);
 }  // end namespace functor
 }  // end namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -67,4 +67,4 @@ DEFINE_FOR_ALL_REDUCERS(float);
 }  // end namespace functor
 }  // end namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -68,4 +68,4 @@ DEFINE_FOR_ALL_REDUCERS(int64);
 }  // end namespace functor
 }  // end namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -64,4 +64,4 @@ DEFINE_FOR_ALL_REDUCERS(Eigen::half);
 }  // end namespace functor
 }  // end namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM


@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define EIGEN_USE_GPU
@@ -64,4 +64,4 @@ DEFINE_FOR_ALL_REDUCERS(Eigen::half);
 }  // end namespace functor
 }  // end namespace tensorflow
-#endif  // GOOGLE_CUDA
+#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM


@@ -33,7 +33,7 @@ namespace tensorflow {
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNELS(type)          \
   REGISTER_KERNEL_BUILDER(                  \


@@ -33,7 +33,7 @@ namespace tensorflow {
 TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNELS(type)          \
   REGISTER_KERNEL_BUILDER(                  \
@@ -51,8 +51,10 @@ TF_CALL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
           .HostMemory("reduction_indices"),                             \
       ReductionOp<GPUDevice, type, int64, functor::MeanReducer<type>>);
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU_KERNELS);
+#if GOOGLE_CUDA
 TF_CALL_complex64(REGISTER_GPU_KERNELS);
 TF_CALL_complex128(REGISTER_GPU_KERNELS);
+#endif
 #undef REGISTER_GPU_KERNELS
 #endif


@@ -33,7 +33,7 @@ namespace tensorflow {
 TF_CALL_REAL_NUMBER_TYPES(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
-#if GOOGLE_CUDA
+#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
 #define REGISTER_GPU_KERNELS(type)          \
   REGISTER_KERNEL_BUILDER(                  \

Some files were not shown because too many files have changed in this diff.