Add standalone XLA AOT runtime target + relevant .cc sources to pip package.

This target and its src/hdr filegroups can be used as a "single source of truth" to identify the .cc and .h files necessary to build the XLA AOT runtime. We export these .cc files as part of the tf pip package so that users who use e.g. saved_model_cli aot compilation know which additional files are required to build a library or binary around their object file. As a followup we'll provide a cmake file that builds these to .o and includes the correct dependencies on absl and nsync cmake files. Additionally update the version of abseil so we can get the new cord dependency. PiperOrigin-RevId: 297986445 Change-Id: Ia5a4d9a6b0673c9edcd5d889d888235ca5f5453b
2020-02-28 19:23:08 -08:00 · 2020-02-28 19:23:08 -08:00 · a7a7e8ae75
commit a7a7e8ae75
parent 03d5fe920a
22 changed files with 409 additions and 68 deletions
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@ -1,4 +1,4 @@
-load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_cuda_cc_test")
+load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test", "tf_cuda_cc_test", "tf_openmp_copts")
 load(
    "//tensorflow/core/platform/default:cuda_build_defs.bzl",
    "if_cuda_is_configured",
@ -11,6 +11,7 @@ load(
 load("//tensorflow/compiler/xla:xla.bzl", "xla_py_proto_library")
 load("//tensorflow:tensorflow.bzl", "tf_portable_proto_library")
 load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured")
+load("//tensorflow/compiler/xla/service/cpu:build_defs.bzl", "runtime_copts")

 package(
    default_visibility = [":internal"],
@ -180,6 +181,76 @@ cc_library(
    ],
 )

+# The filegroups below are explicitly used by
+# tensorflow/tools/pip_package:build_pip_package to ensure we include the proper
+# sources for the XLA AOT CPU runtime; as these are necessary outside of bazel
+# when linking tfcompile objects using saved_model_cli (e.g. using the
+# tensorflow pip package). The associated .cc files are included in tensorflow
+# pip package's xla_aot_runtime_srcs/ subdirectory. All necessary headers are
+# also included in the pip package's include/tensorflow/ and include/external/
+# subdirectories. Note however that sometimes additional object files may need
+# to be linked when linking aot xla objects, e.g. abseil libraries. See the deps
+# attribute of the "xla_compiled_cpu_runtime_standalone" target below for an
+# exhaustive list.
+filegroup(
+    name = "xla_compiled_cpu_runtime_hdrs",
+    srcs = [
+        "xla_compiled_cpu_function.h",
+        "//tensorflow/compiler/xla:cpu_runtime_hdrs",
+        "//tensorflow/compiler/xla/service/cpu:single_threaded_runtime_hdrs",
+        "//tensorflow/core/kernels:xla_cpu_runtime_hdrs",
+        # NOTE(review): this is a *_srcs label listed among the headers. Per
+        # tensorflow/core/platform/BUILD that filegroup mostly lists .h files,
+        # so this is presumably intentional -- confirm.
+        "//tensorflow/core/platform:xla_cpu_runtime_srcs",
+    ],
+    visibility = ["//tensorflow/tools/pip_package:__pkg__"],
+)
+
+filegroup(
+    name = "xla_compiled_cpu_runtime_srcs",
+    srcs = [
+        "xla_compiled_cpu_function.cc",
+        "//tensorflow/compiler/xla:cpu_runtime_srcs",
+        "//tensorflow/compiler/xla/service/cpu:single_threaded_runtime_srcs",
+        "//tensorflow/core/kernels:xla_cpu_runtime_srcs",
+        "//tensorflow/core/platform:xla_cpu_runtime_srcs",
+    ],
+    visibility = ["//tensorflow/tools/pip_package:__pkg__"],
+)
+
+# This stand-alone target is used to ensure that we can build tf_library type
+# targets against the subset of sources declared in
+# xla_compiled_cpu_runtime_{srcs,hdrs}.
+#
+# The macros in tensorflow/python/tools/tools.bzl produce AOT compiled binaries
+# that rely on this target, as do unit tests in tensorflow/python/tools.
+#
+# See above for the significance of the source filegroups.
+cc_library(
+    name = "xla_compiled_cpu_runtime_standalone",
+    srcs = [
+        ":xla_compiled_cpu_runtime_srcs",
+    ],
+    hdrs = [
+        ":xla_compiled_cpu_runtime_hdrs",
+    ],
+    copts = runtime_copts() + tf_openmp_copts(),
+    features = ["fully_static_link"],
+    linkstatic = 1,
+    visibility = [":friends"],
+    # Note, we specifically remove MKL dependencies so the standalone does
+    # not require the MKL binary blob.
+    deps = [
+        "//tensorflow/core/framework:numeric_types",
+        "//third_party/eigen3",
+        "@com_google_absl//absl/base",
+        "@com_google_absl//absl/base:core_headers",
+        "@com_google_absl//absl/base:dynamic_annotations",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:cord",
+        "@com_google_absl//absl/synchronization",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
    name = "xla_compiled_cpu_function",
    srcs = ["xla_compiled_cpu_function.cc"],
@ -190,7 +261,7 @@ cc_library(
        # binary produced by tfcompile.
        "//tensorflow/compiler/xla:cpu_function_runtime",
        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/platform:types",
    ],
 )

--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@ -36,6 +36,25 @@ filegroup(
    ]),
 )

+filegroup(
+    name = "cpu_runtime_srcs",
+    srcs = [
+        "cpu_function_runtime.cc",
+        "executable_run_options.cc",
+    ],
+    visibility = [":friends"],
+)
+
+filegroup(
+    name = "cpu_runtime_hdrs",
+    srcs = [
+        "cpu_function_runtime.h",
+        "executable_run_options.h",
+        "types.h",
+    ],
+    visibility = [":friends"],
+)
+
 tf_proto_library_cc(
    name = "xla_data_proto",
    srcs = ["xla_data.proto"],
@ -142,7 +161,8 @@ cc_library(
    hdrs = ["types.h"],
    visibility = [":friends"],
    deps = [
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/framework:numeric_types",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -620,7 +640,6 @@ cc_library(
    visibility = ["//visibility:public"],
    deps = [
        ":types",
-        "@com_google_absl//absl/strings",
    ],
 )

@ -896,7 +915,10 @@ cc_library(
    srcs = ["cpu_function_runtime.cc"],
    hdrs = ["cpu_function_runtime.h"],
    visibility = [":friends"],
-    deps = ["//tensorflow/core:framework_lite"],
+    deps = [
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:types",
+    ],
 )

 tf_cc_test(
--- a/tensorflow/compiler/xla/executable_run_options.cc
+++ b/tensorflow/compiler/xla/executable_run_options.cc
@ -17,8 +17,6 @@ limitations under the License.

 #include <atomic>

-#include "absl/strings/str_cat.h"
-
 namespace xla {

 RunId::RunId() {
@ -28,7 +26,9 @@ RunId::RunId() {

 bool operator==(const RunId& a, const RunId& b) { return a.data_ == b.data_; }

-std::string RunId::ToString() const { return absl::StrCat("RunId: ", data_); }
+std::string RunId::ToString() const {
+  return "RunId: " + std::to_string(data_);
+}

 ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal(
    int device_ordinal) {
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@ -30,6 +30,32 @@ filegroup(
    ]),
 )

+filegroup(
+    name = "single_threaded_runtime_srcs",
+    srcs = [
+        "runtime_fp16.cc",
+        "runtime_key_value_sort.cc",
+        "runtime_single_threaded_conv2d.cc",
+        "runtime_single_threaded_fft.cc",
+        "runtime_single_threaded_matmul.cc",
+    ],
+    visibility = [":friends"],
+)
+
+filegroup(
+    name = "single_threaded_runtime_hdrs",
+    srcs = [
+        "runtime_conv2d_impl.h",
+        "runtime_fft_impl.h",
+        "runtime_fp16.h",
+        "runtime_key_value_sort.h",
+        "runtime_single_threaded_conv2d.h",
+        "runtime_single_threaded_fft.h",
+        "runtime_single_threaded_matmul.h",
+    ],
+    visibility = [":friends"],
+)
+
 cc_library(
    name = "cpu_transfer_manager",
    srcs = ["cpu_transfer_manager.cc"],
@ -219,7 +245,8 @@ cc_library(
    ],
    copts = runtime_copts(),
    deps = [
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/platform:macros",
+        "//tensorflow/core/platform:types",
    ],
 )

@ -545,8 +572,10 @@ cc_library(
    deps = [
        ":runtime_lightweight_check",
        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/core:framework_lite",
-        "//tensorflow/core/kernels:eigen_helpers",
+        "//tensorflow/core/kernels:eigen_helpers_no_mkl",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:mutex",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -563,7 +592,8 @@ cc_library(
        ":runtime_conv2d",
        ":runtime_single_threaded_conv2d",
        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:types",
        "//tensorflow/core/kernels:eigen_helpers",
        "//third_party/eigen3",
    ] + mkl_deps(),
@ -581,8 +611,10 @@ cc_library(
    deps = [
        ":runtime_lightweight_check",
        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/compiler/xla:xla_data_proto_cc",
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/framework:numeric_types",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:mutex",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -596,8 +628,10 @@ cc_library(
    deps = [
        ":runtime_lightweight_check",
        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/core:framework_lite",
        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:mutex",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -610,7 +644,7 @@ cc_library(
    visibility = ["//visibility:public"],
    deps = [
        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ] + mkl_deps(),
 )
@ -626,8 +660,9 @@ cc_library(
    visibility = ["//visibility:public"],
    deps = [
        ":runtime_lightweight_check",
-        "//tensorflow/core:framework_lite",
        "//tensorflow/core/kernels:eigen_helpers",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -643,7 +678,9 @@ cc_library(
    visibility = ["//visibility:public"],
    deps = [
        "//tensorflow/compiler/xla:xla_data_proto_cc",
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/framework:numeric_types",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -655,8 +692,9 @@ cc_library(
    copts = runtime_copts(),
    visibility = ["//visibility:public"],
    deps = [
-        "//tensorflow/core:framework_lite",
        "//tensorflow/core/kernels:eigen_contraction_kernel",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -668,7 +706,9 @@ cc_library(
    copts = runtime_copts(),
    visibility = ["//visibility:public"],
    deps = [
-        "//tensorflow/core:framework_lite",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:macros",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -681,8 +721,10 @@ cc_library(
    visibility = ["//visibility:public"],
    deps = [
        "//tensorflow/compiler/xla:executable_run_options",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/platform:blocking_counter",
+        "//tensorflow/core/platform:dynamic_annotations",
+        "//tensorflow/core/platform:logging",
+        "//tensorflow/core/platform:types",
        "//third_party/eigen3",
    ],
 )
@ -711,6 +753,23 @@ tf_cc_test(
    ],
 )

+tf_cc_test(
+    name = "runtime_fft_test",
+    srcs = [
+        "runtime_fft_impl.h",
+        "runtime_fft_test.cc",
+    ],
+    deps = [
+        ":runtime_single_threaded_fft",
+        "//tensorflow/compiler/xla:xla_data_proto_cc",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core/framework:numeric_types",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
    name = "cpu_instruction_fusion_test",
    srcs = ["cpu_instruction_fusion_test.cc"],
--- a/tensorflow/compiler/xla/service/cpu/runtime_fft.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft.cc
@ -33,7 +33,8 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenFft(
  const xla::ExecutableRunOptions* run_options =
      static_cast<const xla::ExecutableRunOptions*>(run_options_ptr);
  XLA_LIGHTWEIGHT_CHECK(run_options->intra_op_thread_pool() != nullptr);
-  tensorflow::xla::EigenFftImpl(*run_options->intra_op_thread_pool(), out,
-                                operand, fft_type, fft_rank, input_batch,
-                                fft_length0, fft_length1, fft_length2);
+  tensorflow::xla::EigenFftImpl(
+      *run_options->intra_op_thread_pool(), out, operand,
+      static_cast<tensorflow::xla::FftType>(fft_type), fft_rank, input_batch,
+      fft_length0, fft_length1, fft_length2);
 }
--- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h
@ -19,7 +19,6 @@ limitations under the License.

 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/platform/types.h"

@ -28,6 +27,15 @@ limitations under the License.
 namespace tensorflow {
 namespace xla {

+// Kind of FFT to run. This enum is declared locally instead of using the
+// ::xla::FftType proto enum (this header no longer includes xla_data.pb.h).
+// The numeric values must stay identical to the proto enum's values;
+// runtime_fft_test.cc checks each enumerator against ::xla::FftType.
+enum class FftType : int32 {
+  FFT = 0,    // Forward FFT; complex in, complex out.
+  IFFT = 1,   // Inverse FFT; complex in, complex out.
+  RFFT = 2,   // Forward real FFT; real in, fft_length / 2 + 1 complex out
+  IRFFT = 3,  // Inverse real FFT; fft_length / 2 + 1 complex in,
+              //                   fft_length real out
+};
+// Number of FftType enumerators. runtime_fft_test.cc checks that both this
+// constant and ::xla::FftType_ARRAYSIZE equal 4.
+static constexpr int kFftTypeArraySize = 4;
+
 namespace internal {

 // Computes either a forward or reverse complex-to-complex FFT.
@ -170,27 +178,27 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand,

 template <int FFTRank, typename EigenDevice>
 void EigenFftWithRank(const EigenDevice& device, void* out, void* operand,
-                      int32 fft_type, int64 input_batch, int64 fft_length0,
+                      FftType fft_type, int64 input_batch, int64 fft_length0,
                      int64 fft_length1, int64 fft_length2) {
  switch (fft_type) {
-    case ::xla::FftType::FFT:
+    case FftType::FFT:
      EigenFftC2C<true, FFTRank, EigenDevice>(
          device, static_cast<complex64*>(out),
          static_cast<complex64*>(operand), input_batch, fft_length0,
          fft_length1, fft_length2);
      break;
-    case ::xla::FftType::IFFT:
+    case FftType::IFFT:
      EigenFftC2C<false, FFTRank, EigenDevice>(
          device, static_cast<complex64*>(out),
          static_cast<complex64*>(operand), input_batch, fft_length0,
          fft_length1, fft_length2);
      break;
-    case ::xla::FftType::RFFT:
+    case FftType::RFFT:
      EigenFftR2C<FFTRank, EigenDevice>(
          device, static_cast<complex64*>(out), static_cast<float*>(operand),
          input_batch, fft_length0, fft_length1, fft_length2);
      break;
-    case ::xla::FftType::IRFFT:
+    case FftType::IRFFT:
      EigenFftC2R<FFTRank, EigenDevice>(
          device, static_cast<float*>(out), static_cast<complex64*>(operand),
          input_batch, fft_length0, fft_length1, fft_length2);
@ -205,7 +213,7 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand,

 template <typename EigenDevice>
 void EigenFftImpl(const EigenDevice& device, void* out, void* operand,
-                  int32 fft_type, int32 fft_rank, int64 input_batch,
+                  FftType fft_type, int32 fft_rank, int64 input_batch,
                  int64 fft_length0, int64 fft_length1, int64 fft_length2) {
  switch (fft_rank) {
    case 1:
--- a/tensorflow/compiler/xla/service/cpu/runtime_fft_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_test.cc
@ -0,0 +1,31 @@
+/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/test.h"
+
+// Guards against tensorflow::xla::FftType (declared in runtime_fft_impl.h)
+// drifting from the ::xla::FftType proto enum: both must report an array size
+// of 4, and each enumerator must have the same numeric value in both enums.
+TEST(FftTypeTest, MatchesProto) {
+  EXPECT_EQ(::xla::FftType_ARRAYSIZE, 4);
+  EXPECT_EQ(::tensorflow::xla::kFftTypeArraySize, 4);
+  EXPECT_EQ(::xla::FftType::FFT,
+            static_cast<::tensorflow::int32>(::tensorflow::xla::FftType::FFT));
+  EXPECT_EQ(::xla::FftType::IFFT,
+            static_cast<::tensorflow::int32>(::tensorflow::xla::FftType::IFFT));
+  EXPECT_EQ(::xla::FftType::RFFT,
+            static_cast<::tensorflow::int32>(::tensorflow::xla::FftType::RFFT));
+  EXPECT_EQ(::xla::FftType::IRFFT, static_cast<::tensorflow::int32>(
+                                       ::tensorflow::xla::FftType::IRFFT));
+}
--- a/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_fork_join.cc
@ -19,7 +19,7 @@ limitations under the License.

 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/executable_run_options.h"
-#include "tensorflow/core/lib/core/blocking_counter.h"
+#include "tensorflow/core/platform/blocking_counter.h"
 #include "tensorflow/core/platform/dynamic_annotations.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
--- a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc
@ -26,7 +26,8 @@ TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft(
    const void* run_options_ptr, void* out, void* operand, int32 fft_type,
    int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1,
    int64 fft_length2) {
-  tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type,
+  tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand,
+                                static_cast<tensorflow::xla::FftType>(fft_type),
                                fft_rank, input_batch, fft_length0, fft_length1,
                                fft_length2);
 }
--- a/tensorflow/core/framework/BUILD
+++ b/tensorflow/core/framework/BUILD
@ -595,7 +595,10 @@ cc_library(
 cc_library(
    name = "numeric_types",
    hdrs = ["numeric_types.h"],
-    visibility = ["//tensorflow/core:__subpackages__"],
+    visibility = [
+        "//tensorflow/compiler:__subpackages__",
+        "//tensorflow/core:__subpackages__",
+    ],
    deps = [
        "//tensorflow/core/lib/bfloat16",
        "//tensorflow/core/platform:types",
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@ -836,6 +836,23 @@ cc_library(
    ],
 )

+filegroup(
+    name = "xla_cpu_runtime_hdrs",
+    srcs = [
+        "eigen_contraction_kernel.h",
+        "eigen_convolution_helpers.h",
+        "eigen_spatial_convolutions.h",
+        "eigen_spatial_convolutions-inl.h",
+    ],
+)
+
+filegroup(
+    name = "xla_cpu_runtime_srcs",
+    srcs = [
+        "eigen_contraction_kernel.cc",
+    ],
+)
+
 cc_library(
    name = "redux_functor",
    hdrs = ["redux_functor.h"],
--- a/tensorflow/core/kernels/eigen_convolution_helpers.h
+++ b/tensorflow/core/kernels/eigen_convolution_helpers.h
@ -16,6 +16,8 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_
 #define TENSORFLOW_CORE_KERNELS_EIGEN_CONVOLUTION_HELPERS_H_

+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+
 namespace Eigen {
 namespace internal {

--- a/tensorflow/core/platform/BUILD
+++ b/tensorflow/core/platform/BUILD
@ -335,6 +335,21 @@ cc_library(
    hdrs = ["macros.h"],
 )

+filegroup(
+    name = "xla_cpu_runtime_srcs",
+    srcs = tf_platform_deps("xla_cpu_runtime_srcs") + [
+        "cord.h",
+        "ctstring.h",
+        "ctstring_internal.h",
+        "dynamic_annotations.h",
+        "env_time.h",
+        "macros.h",
+        "platform.h",
+        "tstring.h",
+        "types.h",
+    ],
+)
+
 cc_library(
    name = "mutex",
    textual_hdrs = ["mutex.h"],
--- a/tensorflow/core/platform/default/BUILD
+++ b/tensorflow/core/platform/default/BUILD
@ -193,6 +193,16 @@ cc_library(
    ],
 )

+filegroup(
+    name = "xla_cpu_runtime_srcs",
+    srcs = [
+        "cord.h",
+        "dynamic_annotations.h",
+        "env_time.cc",
+        "integral_types.h",
+    ],
+)
+
 cc_library(
    name = "mutex",
    srcs = [
--- a/tensorflow/python/tools/saved_model_aot_compile.py
+++ b/tensorflow/python/tools/saved_model_aot_compile.py
@ -215,7 +215,8 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
                                   signature_def_key,
                                   cpp_class,
                                   target_triple,
-                                   variables_to_feed=()):
+                                   variables_to_feed=(),
+                                   enable_multithreading=False):
  """Compile a `MetaGraphDef` to header+object files in `output_prefix`.

  Use XLA AOT (`tfcompile`) to convert the given meta graph and
@ -242,6 +243,8 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
      user; these won't be frozen.  If `None`, then we will extract all the
      variables in the graph and mark them as to-feed.  The default behavior is
      an empty tuple: all variables must be frozen.
+    enable_multithreading: Not implemented.  Enable multithreading in the
+      compiled computation.

  Raises:
    RuntimeError: If tensorflow was not built with XLA.
@ -249,10 +252,25 @@ def aot_compile_cpu_meta_graph_def(checkpoint_path,
      issue importing the tfcompile python wrapper.
    ValueError: If `meta_graph_def.signature_def[signature_def_key]` is
      missing or has empty outputs.
+    NotImplementedError: If `enable_multithreading is True`.
  """
  if _pywrap_tfcompile_import_error:
    raise _pywrap_tfcompile_import_error

+  if enable_multithreading:
+    raise NotImplementedError(
+        'Multithreading is not currently supported because it requires '
+        'additional dependencies in the AOT runtime.')
+  else:
+    # TODO(ebrevdo): Pipe DebugOptions through tfcompile::Main and pywrap
+    # so that we can set these directly instead of relying on env vars.
+    xla_flags = os.environ.get('XLA_FLAGS')
+    if not xla_flags:
+      xla_flags = '--xla_cpu_multi_thread_eigen=false'
+    else:
+      xla_flags += ',--xla_cpu_multi_thread_eigen=false'
+    os.environ['XLA_FLAGS'] = xla_flags
+
  signature_def_map = meta_graph_def.signature_def
  if signature_def_key not in signature_def_map:
    raise ValueError(
--- a/tensorflow/python/tools/saved_model_cli.py
+++ b/tensorflow/python/tools/saved_model_cli.py
@ -807,7 +807,8 @@ def aot_compile_cpu(args):
      variables_to_feed=variables_to_feed,
      output_prefix=args.output_prefix,
      target_triple=args.target_triple,
-      cpp_class=args.cpp_class)
+      cpp_class=args.cpp_class,
+      enable_multithreading=args.enable_multithreading)


 def add_show_subparser(subparsers):
@ -1034,9 +1035,8 @@ def add_aot_compile_cpu_subparser(subparsers):
       '',
       'Some possibly useful flags:',
       '  --xla_cpu_enable_fast_math=false',
-       '  --xla_cpu_multi_thread_eigen=false',
       '  --xla_force_host_platform_device_count=<num threads>',
-       '    (useful in conjunction with disabling eigen multi threading)'
+       '    (useful in conjunction with disabling multi threading)'
      ])

  parser_compile = subparsers.add_parser(
@ -1103,6 +1103,12 @@ def add_aot_compile_cpu_subparser(subparsers):
            'values will be uninitialized in the compiled object '
            '(this applies to all input arguments from the signature as '
            'well).'))
+  parser_compile.add_argument(
+      '--enable_multithreading',
+      type=bool,
+      default='',
+      help=('*NOT CURRENTLY SUPPORTED*  '
+            'Enable multithreading in the compiled computation.'))

  parser_compile.set_defaults(func=aot_compile_cpu)

--- a/tensorflow/python/tools/tools.bzl
+++ b/tensorflow/python/tools/tools.bzl
@ -154,8 +154,7 @@ def saved_model_compile_aot(
        tags = tags,
        deps = _maybe_force_compile(
            [
-                "//tensorflow/compiler/tf2xla:xla_compiled_cpu_function",
-                "//tensorflow/core/platform:types",
+                "//tensorflow/compiler/tf2xla:xla_compiled_cpu_runtime_standalone",
            ],
            force_compile = force_without_xla_support_flag,
        ),
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@ -2709,3 +2709,18 @@ def tfcompile_extra_flags():
 def tf_external_workspace_visible(visibility):
    # External workspaces can see this target.
    return ["//visibility:public"]
+
+# Rule implementation: writes the short_path of every file provided by the
+# `dep` attribute to an output file named after the target, one path per line.
+# The paths are joined with "\n", so the file has no trailing newline.
+def _filegroup_as_file(ctx):
+    out = ctx.actions.declare_file(ctx.label.name)
+    ctx.actions.write(
+        output = out,
+        content = "\n".join([f.short_path for f in ctx.files.dep]),
+    )
+    return DefaultInfo(files = depset([out]))
+
+# Rule that materializes the list of files behind the `dep` label as a text
+# file, which is the rule's only default output.
+filegroup_as_file = rule(
+    implementation = _filegroup_as_file,
+    attrs = {
+        "dep": attr.label(),
+    },
+)
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@ -1,7 +1,7 @@
 # Description:
 #  Tools for building the TensorFlow pip package.

-load("//tensorflow:tensorflow.bzl", "if_windows", "transitive_hdrs")
+load("//tensorflow:tensorflow.bzl", "filegroup_as_file", "if_windows", "transitive_hdrs")
 load("//third_party/mkl:build_defs.bzl", "if_mkl", "if_mkl_ml")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_syslibs//:build_defs.bzl", "if_not_system_lib")
@ -66,6 +66,11 @@ py_binary(
    deps = ["//tensorflow:tensorflow_py"],
 )

+filegroup_as_file(
+    name = "xla_compiled_cpu_runtime_srcs.txt",
+    dep = "//tensorflow/compiler/tf2xla:xla_compiled_cpu_runtime_srcs",
+)
+
 # Add dynamic kernel dso files here.
 DYNAMIC_LOADED_KERNELS = [
    "//tensorflow/core/kernels:libtfkernel_sobol_op.so",
@ -77,7 +82,10 @@ COMMON_PIP_DEPS = [
    "README",
    "setup.py",
    ":included_headers",
+    ":xla_compiled_cpu_runtime_srcs.txt",
    "//tensorflow:tensorflow_py",
+    "//tensorflow/compiler/tf2xla:xla_compiled_cpu_runtime_hdrs",
+    "//tensorflow/compiler/tf2xla:xla_compiled_cpu_runtime_srcs",
    "//tensorflow/core:protos_all_proto_srcs",
    "//tensorflow/examples/saved_model/integration_tests:mnist_util",
    "//tensorflow/lite/python/testdata:interpreter_test_data",
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@ -41,6 +41,30 @@ function cp_external() {
  cp "${src_dir}/local_config_cuda/cuda/cuda/cuda_config.h" "${dest_dir}/local_config_cuda/cuda/cuda/"
 }

+function copy_xla_aot_runtime_sources() {
+  # Copies the files listed in xla_compiled_cpu_runtime_srcs.txt into the
+  # destination directory, keeping their tensorflow/... relative paths.
+  #
+  # Args:
+  #   $1: directory to change into; the list file and the listed sources are
+  #       resolved relative to it.
+  #   $2: destination directory. It is used after the pushd below, so a
+  #       relative path is resolved against $1, not the caller's directory.
+  local src_dir="$1"
+  local dst_dir="$2"
+  local srcs_list="tensorflow/tools/pip_package/xla_compiled_cpu_runtime_srcs.txt"
+  # Keep the loop variables local so they do not leak into the caller's scope.
+  local file prefix candidate_file dn
+
+  pushd "${src_dir}" || return 1
+  if [[ ! -f "${srcs_list}" ]]; then
+    # Stay best-effort, as for individual missing sources below.
+    echo "Missing xla source list: ${src_dir}/${srcs_list}" 1>&2
+    popd
+    return 0
+  fi
+  # The list has one path per line and may lack a trailing newline; the extra
+  # -n test keeps the last entry from being dropped in that case.
+  while IFS= read -r file || [[ -n "${file}" ]]; do
+    if [[ -z "${file}" ]]; then
+      continue
+    fi
+    # Sometimes $file has a prefix bazel-out/host/ we want to remove.
+    prefix="${file%%tensorflow/*}"  # Everything before the first "tensorflow/"
+    candidate_file="${file#"${prefix}"}"  # Remove the prefix (literal match)
+    if [[ -n "${candidate_file}" ]]; then
+      file="${candidate_file}"
+    fi
+    dn="$(dirname "${file}")"
+    if [[ -f "${file}" ]]; then
+      mkdir -p "${dst_dir}/${dn}"
+      cp "${file}" "${dst_dir}/${file}"
+    else
+      echo "Missing xla source file: ${file}" 1>&2
+    fi
+  done < "${srcs_list}"
+  popd
+}
+
 function move_to_root_if_exists () {
  arg_to_move="$1"
  if [ -e "${arg_to_move}" ]; then
@ -84,6 +108,7 @@ function prepare_src() {
  TMPDIR="${1%/}"
  mkdir -p "$TMPDIR"
  EXTERNAL_INCLUDES="${TMPDIR}/tensorflow/include/external"
+  XLA_AOT_RUNTIME_SOURCES="${TMPDIR}/tensorflow/xla_aot_runtime_src"

  echo $(date) : "=== Preparing sources in dir: ${TMPDIR}"

@ -108,6 +133,9 @@ function prepare_src() {
    cp_external \
      bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
      "${EXTERNAL_INCLUDES}/"
+    copy_xla_aot_runtime_sources \
+      bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles \
+      "${XLA_AOT_RUNTIME_SOURCES}/"
    RUNFILES=bazel-bin/tensorflow/tools/pip_package/simple_console_for_window_unzip/runfiles/org_tensorflow
  else
    RUNFILES=bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow
@ -122,6 +150,9 @@ function prepare_src() {
      cp_external \
        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow/external \
        "${EXTERNAL_INCLUDES}"
+      copy_xla_aot_runtime_sources \
+        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow \
+        "${XLA_AOT_RUNTIME_SOURCES}"
      # Copy MKL libs over so they can be loaded at runtime
      so_lib_dir=$(ls $RUNFILES | grep solib) || true
      if [ -n "${so_lib_dir}" ]; then
@ -142,6 +173,9 @@ function prepare_src() {
      cp_external \
        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles \
        "${EXTERNAL_INCLUDES}"
+      copy_xla_aot_runtime_sources \
+        bazel-bin/tensorflow/tools/pip_package/build_pip_package.runfiles/org_tensorflow \
+        "${XLA_AOT_RUNTIME_SOURCES}"
      # Copy MKL libs over so they can be loaded at runtime
      so_lib_dir=$(ls $RUNFILES | grep solib) || true
      if [ -n "${so_lib_dir}" ]; then
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@ -189,11 +189,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""):
        # TODO: Remove the patch when https://github.com/abseil/abseil-cpp/issues/326 is resolved
        # and when TensorFlow is build against CUDA 10.2
        patch_file = clean_dep("//third_party:com_google_absl_fix_mac_and_nvcc_build.patch"),
-        sha256 = "acd93f6baaedc4414ebd08b33bebca7c7a46888916101d8c0b8083573526d070",  # SHARED_ABSL_SHA
-        strip_prefix = "abseil-cpp-43ef2148c0936ebf7cb4be6b19927a9d9d145b8f",
+        sha256 = "dfe63f014801d5bb1be64c0f94545e3a4a957916a2d353e49f7b746c25636198",  # SHARED_ABSL_SHA
+        strip_prefix = "abseil-cpp-b69c7d880caddfc25bf348dbcfe9d45fdd8bc6e6",
        urls = [
-            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/43ef2148c0936ebf7cb4be6b19927a9d9d145b8f.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/43ef2148c0936ebf7cb4be6b19927a9d9d145b8f.tar.gz",
+            "https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/b69c7d880caddfc25bf348dbcfe9d45fdd8bc6e6.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/b69c7d880caddfc25bf348dbcfe9d45fdd8bc6e6.tar.gz",
        ],
    )

--- a/third_party/com_google_absl_fix_mac_and_nvcc_build.patch
+++ b/third_party/com_google_absl_fix_mac_and_nvcc_build.patch
@ -1,6 +1,44 @@
--- ./absl/time/internal/cctz/BUILD.bazel	2019-09-23 13:20:52.000000000 -0700
-+++ ./absl/time/internal/cctz/BUILD.bazel.fixed	2019-09-23 13:20:48.000000000 -0700
-@@ -76,15 +76,6 @@
+diff -u -r old/absl/strings/string_view.h new/absl/strings/string_view.h
+--- old/absl/strings/string_view.h	2020-02-21 12:56:04.000000000 -0800
+++ new/absl/strings/string_view.h	2020-02-25 18:39:26.377782568 -0800
+@@ -292,11 +292,18 @@
+   // and an exception of type `std::out_of_range` will be thrown on invalid
+   // access.
+   constexpr const_reference at(size_type i) const {
+#if defined(__NVCC__)
+    // An nvcc bug treats the original return expression as a non-constant,
+    // which is not allowed in a constexpr function. This only happens when
+    // NDEBUG is not defined. This will be fixed in the CUDA 10.2 release.
+    return ptr_[i];
+#else
+     return ABSL_PREDICT_TRUE(i < size())
+                ? ptr_[i]
+                : ((void)base_internal::ThrowStdOutOfRange(
+                       "absl::string_view::at"),
+                   ptr_[i]);
+#endif
+   }
+ 
+   // string_view::front()
+@@ -519,7 +526,14 @@
+       (std::numeric_limits<difference_type>::max)();
+ 
+   static constexpr size_type CheckLengthInternal(size_type len) {
+#if defined(__NVCC__) && (__CUDACC_VER_MAJOR__<10 || (__CUDACC_VER_MAJOR__==10 && __CUDACC_VER_MINOR__<2)) && !defined(NDEBUG)
+    // An nvcc bug treats the original return expression as a non-constant,
+    // which is not allowed in a constexpr function. This only happens when
+    // NDEBUG is not defined. This will be fixed in the CUDA 10.2 release.
+    return len;
+#else
+     return (void)ABSL_ASSERT(len <= kMaxSize), len;
+#endif
+   }
+ 
+   static constexpr size_type StrlenInternal(const char* str) {
+diff -u -r old/absl/time/internal/cctz/BUILD.bazel new/absl/time/internal/cctz/BUILD.bazel
+--- old/absl/time/internal/cctz/BUILD.bazel	2020-02-21 12:56:04.000000000 -0800
+++ new/absl/time/internal/cctz/BUILD.bazel	2020-02-25 15:19:29.013710932 -0800
+@@ -74,15 +74,6 @@
         "include/cctz/time_zone.h",
         "include/cctz/zone_info_source.h",
     ],
@ -14,22 +52,5 @@
 -        "//conditions:default": [],
 -    }),
     visibility = ["//visibility:public"],
-     deps = [":civil_time"],
- )
--- ./absl/strings/string_view.h	2019-09-23 13:20:52.000000000 -0700
-+++ ./absl/strings/string_view.h.fixed	2019-09-23 13:20:48.000000000 -0700
-@@ -492,7 +492,14 @@
-       (std::numeric_limits<difference_type>::max)();
- 
-   static constexpr size_type CheckLengthInternal(size_type len) {
-+#if defined(__NVCC__) && (__CUDACC_VER_MAJOR__<10 || (__CUDACC_VER_MAJOR__==10 && __CUDACC_VER_MINOR__<2)) && !defined(NDEBUG)
-+    // An nvcc bug treats the original return expression as a non-constant,
-+    // which is not allowed in a constexpr function. This only happens when
-+    // NDEBUG is not defined. This will be fixed in the CUDA 10.2 release.
-+    return len;
-+#else
-     return ABSL_ASSERT(len <= kMaxSize), len;
-+#endif
-   }
- 
-   const char* ptr_;
+     deps = [
+         ":civil_time",