Adjust TPU build dependencies.
PiperOrigin-RevId: 323453889
Change-Id: I6b33b57830d3414d45aaa066735c3b52f217e739
Commit 9872ddd15a (parent 20cd718248)
@@ -225,6 +225,8 @@ cc_library(
         "xla_device_context.h",
         "xla_device_ops.h",
     ],
+    # Public visibility is needed for external TF/XLA backends.
+    visibility = ["//visibility:public"],
     deps = XLA_DEVICE_DEPS,
 )
 
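For context, the hunk above makes what appears to be the xla_device library in tensorflow/compiler/jit/BUILD publicly visible, so an out-of-tree XLA backend can depend on it directly. A minimal sketch of such a consumer; the external package, target, and source names are hypothetical, only the //tensorflow/compiler/jit:xla_device label is assumed from the hunk:

    # BUILD file of a hypothetical out-of-tree TF/XLA backend.
    cc_library(
        name = "my_xla_backend",
        srcs = ["my_xla_backend.cc"],
        deps = [
            # Resolvable from outside the package now that the library
            # above carries visibility = ["//visibility:public"].
            "//tensorflow/compiler/jit:xla_device",
        ],
    )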
@@ -295,19 +295,6 @@ bool EqualFunctionDefLibrary(const FunctionDefLibrary& expected,
       << diff << "\nActual: " << actual.DebugString(); \
   } while (false)
 
-// These dummy Op registrations are here because the real Op registrations live
-// in contrib and there can't be a dependence from this test to contrib.
-REGISTER_OP("XlaHostCompute")
-    .Input("inputs: Tinputs")
-    .Output("outputs: Toutputs")
-    .Attr("Tinputs: list(type) >= 0")
-    .Attr("Toutputs: list(type) >= 0")
-    .Attr("ancestors: list(string) >= 0")
-    .Attr("key: string")
-    .Attr("shape_inference_graph: func")
-    .Attr("shapes: list(shape) >= 0")
-    .SetShapeFn(::tensorflow::shape_inference::UnknownShape);
-
 REGISTER_OP("InputTest")
     .Output("o: float")
     .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
@@ -947,6 +934,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionOneOutside) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const DataType>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -1114,6 +1103,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O2"},
           {"shape_inference_graph", shape_inference_graph2},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const DataType>({})},
           {"_outside_compilation_subgraph", "O2"},
           {"_xla_token_input_nodes",
@@ -1130,6 +1121,8 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph1},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const DataType>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -1266,6 +1259,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes",
            absl::Span<const TensorShapeProto>({shape_proto_expected})},
           {"_outside_compilation_subgraph", "O1"},
@@ -1295,6 +1290,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutside) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F2_F2_O1"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes",
            absl::Span<const TensorShapeProto>({shape_proto_expected})},
           {"_outside_compilation_subgraph", "O1"},
@@ -1428,6 +1425,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes",
            absl::Span<const TensorShapeProto>({shape_proto_expected})},
           {"_outside_compilation_subgraph", "O1"},
@@ -1454,6 +1453,8 @@ TEST(EncapsulateSubgraphsTest, TwoFunctionsTwoOutsideDependencyFromOutside) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F2_F2_O1"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes",
            absl::Span<const TensorShapeProto>({shape_proto_expected})},
           {"_outside_compilation_subgraph", "O1"},
@@ -1566,6 +1567,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputs) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes",
            absl::Span<const TensorShapeProto>({shape_proto_expected})},
           {"_outside_compilation_subgraph", "O1"},
@@ -1658,6 +1661,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlInput) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes",
            absl::Span<const TensorShapeProto>({shape_proto_expected})},
           {"_outside_compilation_subgraph", "O1"},
@@ -1765,6 +1770,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoOutputs) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -1875,6 +1882,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationControlOutput) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -2009,6 +2018,8 @@ TEST(EncapsulateSubgraphsTest,
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph1},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -2023,6 +2034,8 @@ TEST(EncapsulateSubgraphsTest,
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O2"},
           {"shape_inference_graph", shape_inference_graph2},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O2"},
           {"_xla_token_input_nodes",
@@ -2153,6 +2166,8 @@ TEST(EncapsulateSubgraphsTest,
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O2"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O2"},
           {"_xla_token_input_nodes",
@@ -2169,6 +2184,8 @@ TEST(EncapsulateSubgraphsTest,
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -2296,6 +2313,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -2310,6 +2329,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O2"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O2"},
           {"_xla_token_input_nodes",
@@ -2325,6 +2346,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationClusterDependency) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O3"},
           {"shape_inference_graph", NameAttrList()},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O3"},
           {"_xla_token_input_nodes",
@@ -2451,6 +2474,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationNoInputsOrOutputs) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const TensorShapeProto>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -2567,6 +2592,8 @@ TEST(EncapsulateSubgraphsTest, OutsideCompilationShapeInference) {
           {"ancestors", absl::Span<const string>({})},
           {"key", "host_compute_channel_F1_F1_O1"},
           {"shape_inference_graph", shape_inference_graph},
+          {"tpu_core", 0},
+          {"cost_estimate_ns", 1000000},
           {"shapes", absl::Span<const DataType>({})},
           {"_outside_compilation_subgraph", "O1"},
           {"_xla_token_input_nodes",
@@ -2420,6 +2420,7 @@ Status ExtractOutsideCompilationForFunction(
     auto updated_fdef = absl::make_unique<FunctionDef>();
     TF_RETURN_IF_ERROR(
         GraphToFunctionDef(*g, new_func_name, updated_fdef.get()));
+    updated_fdef->mutable_signature()->set_is_stateful(true);
     const FunctionDef* original_fdef = fld->Find(func_name);
     if (original_fdef) {
       for (const auto& attr : original_fdef->attr()) {
@@ -422,19 +422,6 @@ TEST_F(ExtractOutsideCompilationForFunctionTest, NoHostGraph) {
   EXPECT_EQ(fld.Find("host_graph"), nullptr);
 }
 
-REGISTER_OP("XlaSendToHost")
-    .Input("input: Tinput")
-    .Attr("Tinput: type")
-    .Attr("key: string")
-    .SetIsStateful();
-
-REGISTER_OP("XlaRecvFromHost")
-    .Output("output: Toutput")
-    .Attr("Toutput: type")
-    .Attr("shape: shape")
-    .Attr("key: string")
-    .SetIsStateful();
-
 TEST_F(ExtractOutsideCompilationForFunctionTest, OutsideCompilationInIf) {
   // Build the XLA computation func.
   // "const0" (bool)
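These dummy registrations are deletable because the real ops are now linkable from //tensorflow/core/tpu/ops. A hedged sketch of how a test target would pull in the real registrations instead; the target and source names are illustrative, and only the host_compute_ops label comes from this change:

    tf_cc_test(
        name = "outside_compilation_ops_test",  # illustrative
        srcs = ["outside_compilation_ops_test.cc"],  # illustrative
        deps = [
            "//tensorflow/core:test",
            "//tensorflow/core:test_main",
            # Real XlaSendToHost/XlaRecvFromHost registrations, replacing
            # the deleted local REGISTER_OP stubs.
            "//tensorflow/core/tpu/ops:host_compute_ops",
        ],
    )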
@@ -880,6 +880,7 @@ cc_library(
         ":tpu_outfeed_ops_op_lib",
         ":tpu_ordinal_selector_ops_op_lib",
         ":tpu_replication_ops_op_lib",
+        "//tensorflow/core/tpu/ops",
     ],
 ) + if_mkl([
     ":mkl_array_ops_op_lib",
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "DataServiceDataset"
+}
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "KthOrderStatistic"
+  summary: "Computes the Kth order statistic of a data set. The current"
+  description: <<END
+implementation uses a binary search requiring exactly 32 passes over
+the input data. The running time is linear with respect to input
+size. The median-of-medians algorithm is probably faster, but is
+difficult to implement efficiently in XLA. The implementation imposes
+a total ordering on floats. The ordering is consistent with the usual
+partial order. Positive NaNs are greater than positive
+infinity. Negative NaNs are less than negative infinity. NaNs with
+distinct payloads are treated as distinct. Subnormal numbers are
+preserved (not flushed to zero). Positive infinity is greater than all
+numbers. Negative infinity is less than all numbers. Positive is
+greater than negative zero. There are less than k values greater than
+the kth order statistic. There are at least k values greater than or
+equal to the Kth order statistic. The semantics are not the same as
+top_k_unique.
+END
+}
tensorflow/core/api_def/base_api/api_def_MakeUnique.pbtxt (new file, +10)
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "MakeUnique"
+  summary: "Make all elements in the non-Batch dimension unique, but \"close\" to"
+  description: <<END
+their initial value. Never returns a sub-normal number. Never returns
+zero. The sign of each input element is always identical to the sign
+of the corresponding output element. Behavior for infinite elements is
+undefined. Behavior for subnormal elements is undefined.
+END
+}
tensorflow/core/api_def/base_api/api_def_TPUCompile.pbtxt (new file, +21)
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "TPUCompile"
+  summary: "Compiles a computations for execution on one or more TPU devices."
+  description: <<END
+For the internal use of the distributed TPU compiler.
+
+'num_computations' is the number of computations to be compiled.
+'function' is a function containing the computation to compile.
+'dynamic_shapes' contains dynamic shapes of arguments whose shapes were not
+known statically at TPUReplication rewrite time.
+'guaranteed_constants' is a list of tensors which have been guaranteed to not
+change their values during the session lifetime. These contain tensors marked as
+constant using the GuaranteeConstOp.
+'metadata' is a serialized TPUCompileMetadataProto describing
+the shapes and types of the inputs to the computation, as well as a mapping onto
+the TPU pod topology.
+Each 'program' output is a string key that is passed to the _TPUExecute op and
+used to look up the program in the compilation cache.
+'may_modify_variables' indicates whether variables may be modified.
+END
+}
@@ -0,0 +1,9 @@
+op {
+  graph_op_name: "TPUCompileSucceededAssert"
+  summary: "Asserts that compilation succeeded. This op produces no output and closes the"
+  description: <<END
+device during failure to ensure all pending device interactions fail.
+
+'compilation_status' is a serialized CompilationResultProto.
+END
+}
@@ -0,0 +1,7 @@
+op {
+  graph_op_name: "TPUExecute"
+  summary: "Op that loads and executes a TPU program on a TPU device."
+  description: <<END
+For the internal use of the distributed TPU compiler.
+END
+}
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TPUExecuteAndUpdateVariables"
+  summary: "Op that executes a program with optional in-place variable updates."
+  description: <<END
+It (optionally) reads device variables, loads and executes a TPU program on a
+TPU device, and then (optionally) in-place updates variables using the program
+outputs, as specified in attributes device_var_reads_indices (program input
+indices from directly reading variables) and device_var_updates_indices (program
+output indices used to update variables, -1 means no-update/read-only). Such
+program outputs are consumed by these variables will not appear in the op
+output. For the internal use of the distributed TPU compiler.
+END
+}
@@ -0,0 +1,23 @@
+op {
+  graph_op_name: "TPUPartitionedInput"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of partitioned inputs which must have the same shape.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A handle which represents the full shape of partitioned tensors.
+END
+  }
+  attr {
+    name: "partition_dim"
+    description: <<END
+An integer describles which dimension is partitioned. -1 means
+those inputs are replicated.
+END
+  }
+  summary: "An op that groups a list of partitioned inputs together. This op"
+}
@@ -0,0 +1,25 @@
+op {
+  graph_op_name: "TPUPartitionedOutput"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A tensor which represents the full shape of partitioned tensors.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A list of partitioned inputs which must have the same shape.
+END
+  }
+  attr {
+    name: "partition_dim"
+    description: <<END
+An integer describles which dimension is partitioned.
+END
+  }
+  summary: "An op that demultiplexes a tensor to be sharded by XLA to a list of partitioned"
+  description: <<END
+outputs outside the XLA computation.
+END
+}
tensorflow/core/api_def/base_api/api_def_TopKUnique.pbtxt (new file, +18)
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "TopKUnique"
+  summary: "Returns the TopK unique values in the array in sorted order. The"
+  description: <<END
+running time is proportional to the product of K and the input
+size. Sorting the whole array is more efficient for sufficiently large
+values of K. The median-of-medians algorithm is probably faster, but
+difficult to implement efficiently in XLA. If there are fewer than K
+unique numbers (not NANs), the results are padded with negative
+infinity. NaNs are never returned. Subnormal numbers are flushed to
+zero. If an element appears at multiple indices, the highest index is
+returned. If a TopK element never appears in the input due to padding
+values, the indices are padded with negative one. If a padding value
+appears in the input and padding is needed, the highest index of the
+padding value will be returned. The semantics are not the same as
+kth_order_statistic.
+END
+}
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "TopKWithUnique"
+  summary: "Returns the TopK values in the array in sorted order. This is a combination"
+  description: <<END
+of MakeUnique and TopKUnique. The returned top-K will have its lower bits
+replaced by iota, thus it will be close to the original value but not exactly
+the same. The running time is proportional to the product of K and the input
+size. NaNs are never returned. Subnormal numbers are flushed to zero.
+END
+}
@@ -0,0 +1,66 @@
+op {
+  graph_op_name: "XlaHostCompute"
+  in_arg {
+    name: "inputs"
+    description: <<END
+A list of tensors that will be sent to the host.
+END
+  }
+  out_arg {
+    name: "outputs"
+    description: <<END
+A list of tensors that will be returned to the device.
+END
+  }
+  attr {
+    name: "Tinputs"
+    description: <<END
+The element types of each element in `inputs`.
+END
+  }
+  attr {
+    name: "Toutputs"
+    description: <<END
+The element types of each element in `outputs`.
+END
+  }
+  attr {
+    name: "ancestors"
+    description: <<END
+A list of names of HostCompute computations that must be
+sequenced before this computation.
+END
+  }
+  attr {
+    name: "shapes"
+    description: <<END
+If shape_inference_graph is empty, a list of the shapes of `outputs`.
+END
+  }
+  attr {
+    name: "shape_inference_graph"
+    description: <<END
+If non-empty, a serialized GraphDef representing a graph
+that must be analyzed at compile time to determine the shapes of the outputs.
+END
+  }
+  attr {
+    name: "key"
+    description: <<END
+A unique identifier for this region used to match up host transfers.
+END
+  }
+  attr {
+    name: "cost_estimate_ns"
+    description: <<END
+Estimated duration of the host computation in nanoseconds.
+END
+  }
+  attr {
+    name: "tpu_core"
+    description: <<END
+Default core to use for host to device transfers.
+END
+  }
+  summary: "A pseudo-op to represent host-side computation in an XLA program."
+}
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "XlaRecvFromHost"
+}
tensorflow/core/api_def/base_api/api_def_XlaSendToHost.pbtxt (new file, +12)
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "XlaSendToHost"
+  in_arg: {
+    name: "input"
+  }
+  attr {
+    name: "Tinput"
+  }
+  attr {
+    name: "key"
+  }
+}
@@ -1,6 +1,7 @@
 load(
     "//tensorflow:tensorflow.bzl",
     "cc_header_only_library",
+    "if_tpu",
     "tf_cc_test",
     "tf_cc_test_mkl",
     "tf_cc_tests",
@@ -91,7 +92,7 @@ cc_library(
         ":core_cpu",
         "//tensorflow/core/common_runtime/gpu:gpu_runtime",
         "//tensorflow/core/common_runtime/sycl:sycl_runtime",
-    ],
+    ] + if_tpu(["//tensorflow/core/tpu:tpu_runtime"]),
 )
 
 filegroup(
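The if_tpu macro loaded above from //tensorflow:tensorflow.bzl is a conventional select() wrapper. A hedged sketch of the shape it plausibly takes; the config_setting label "with_tpu_support" is an assumption, not shown in this diff:

    # Hypothetical sketch of if_tpu in //tensorflow:tensorflow.bzl.
    def if_tpu(if_true, if_false = []):
        """Shorthand for a select() on whether TPU support is enabled."""
        return select({
            "//tensorflow:with_tpu_support": if_true,  # assumed setting name
            "//conditions:default": if_false,
        })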
@@ -78,8 +78,7 @@ cc_library(
     srcs = ["tpu_compile_interface.cc"],
     hdrs = ["tpu_compile_interface.h"],
     deps = [
-        "//tensorflow/core/platform:fingerprint",
-        "//tensorflow/core/platform:logging",
+        "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -146,8 +145,7 @@ cc_library(
         ":tpu_api",
         ":tpu_config_c_api",
         ":tpu_library_init_fns",
-        "//tensorflow/core/platform:errors",
-        "//tensorflow/core/platform:status",
+        "//tensorflow/core:lib",
         "//tensorflow/core/tpu/graph_rewrite:tpu_rewrite_pass_registration",
         "//tensorflow/core/tpu/kernels:tpu_compile_c_api_hdrs",
         "//tensorflow/core/tpu/kernels:tpu_execute_c_api_hdrs",
@@ -155,22 +153,7 @@ cc_library(
         "//tensorflow/core/tpu/kernels:tpu_util_c_api_hdrs",
         "//tensorflow/stream_executor/tpu:tpu_executor_c_api_hdrs",
         "//tensorflow/stream_executor/tpu:tpu_node_context_c_api_hdrs",
-    ] + select({
-        "//tensorflow:oss": [
-            ":tpu_compilation_device",
-            ":tpu_node_device",
-            ":tpu_system_device",
-            "//tensorflow/core/tpu/ops:host_compute_ops",
-            "//tensorflow/core/tpu/ops:topk_ops",
-            "//tensorflow/core/tpu/ops:tpu_compile_op",
-            "//tensorflow/core/tpu/ops:tpu_execute_op",
-            "//tensorflow/core/tpu/ops:tpu_partitioned_ops",
-            "//tensorflow/stream_executor/tpu:tpu_executor",
-            "//tensorflow/stream_executor/tpu:tpu_transfer_manager",
-            "//tensorflow/core/tpu:tpu_on_demand_compiler",
-        ],
-        "//conditions:default": [],
-    }),
+    ],
 )
 
 cc_library(
@@ -193,17 +176,12 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:session_options",
-        "//tensorflow/core/common_runtime:copy_tensor",
-        "//tensorflow/core/common_runtime:device",
-        "//tensorflow/core/common_runtime:device_factory",
-        "//tensorflow/core/common_runtime:dma_helper",
-        "//tensorflow/core/framework:kernel_def_proto_cc",
-        "//tensorflow/core/lib/core:status",
-        "//tensorflow/core/platform:status",
-        "//tensorflow/core/tpu/kernels:tpu_configuration_ops",
-        "//tensorflow/core/tpu/kernels:tpu_util",
         "//tensorflow/stream_executor/tpu:c_api_conversions",
         "//tensorflow/stream_executor/tpu:status_helper",
         "//tensorflow/stream_executor/tpu:tpu_node_context",
@@ -219,11 +197,10 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":virtual_device",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
+        "//tensorflow/core:lib",
         "//tensorflow/core:session_options",
-        "//tensorflow/core/common_runtime:device_factory",
-        "//tensorflow/core/lib/core:status",
         "//tensorflow/stream_executor/tpu:tpu_executor_base",
     ],
 )
@@ -245,7 +222,6 @@ cc_library(
     hdrs = ["tpu_execute.h"],
     deps = [
         ":tpu_api",
-        "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -303,3 +279,20 @@ cc_library(
     ],
     alwayslink = True,
 )
+
+cc_library(
+    name = "tpu_runtime",
+    srcs = [],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tpu_api_dlsym_initializer",
+        ":tpu_compilation_device",
+        ":tpu_node_device",
+        ":tpu_system_device",
+        "//tensorflow/core/tpu:tpu_on_demand_compiler",
+        "//tensorflow/core/tpu/graph_rewrite:tpu_rewrite_pass_registration",
+        "//tensorflow/core/tpu/ops",
+        "//tensorflow/stream_executor/tpu:tpu_executor",
+        "//tensorflow/stream_executor/tpu:tpu_transfer_manager",
+    ],
+)
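With the new tpu_runtime umbrella target in place, pulling the whole TPU runtime into a build reduces to one conditional dependency. A sketch of a consumer; the binary and source names are illustrative, and only the two dependency labels come from this diff:

    cc_binary(
        name = "my_tf_server",  # illustrative
        srcs = ["my_tf_server_main.cc"],  # illustrative
        deps = [
            "//tensorflow/core:core_cpu",
        ] + if_tpu(["//tensorflow/core/tpu:tpu_runtime"]),
    )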
@@ -25,7 +25,14 @@ package(
 tf_kernel_library(
     name = "kernels",
     visibility = ["//visibility:public"],
-    deps = [":tpu_configuration_ops"],
+    deps = [
+        ":cross_replica_ops",
+        ":host_compute_ops",
+        ":topk_ops",
+        ":tpu_compile_op",
+        ":tpu_configuration_ops",
+        ":tpu_execute_op",
+    ],
 )
 
 cc_library(
@@ -347,7 +354,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/platform:casts",  # buildcleaner: keep
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/core/tpu:tpu_api",
         "@com_google_absl//absl/base:core_headers",
@@ -383,8 +389,10 @@ cc_library(
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
-        "//tensorflow/core/platform:refcount",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc",
         "@com_google_absl//absl/container:node_hash_map",
@@ -398,7 +406,7 @@ cc_library(
     name = "tpu_compilation_metrics_hdrs",
     hdrs = ["tpu_compilation_metrics.h"],
     deps = [
-        "//tensorflow/core/platform:types",
+        "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -553,20 +561,20 @@ cc_library(
         DEFAULT: [],
     }),
     deps = [
+        ":tpu_compilation_cache_key",
+        ":tpu_compile_c_api_hdrs",
+        ":tpu_compile_op_common",
+        ":tpu_compile_op_support",
+        ":tpu_compile_proto_cc",
+        ":tpu_mesh_state_c_api_hdrs",
+        ":tpu_program_group",
+        ":tpu_program_group_interface",
+        ":tpu_util",
         "//tensorflow/compiler/jit:shape_inference",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core/tpu/kernels:tpu_compilation_cache_key",
-        "//tensorflow/core/tpu/kernels:tpu_compile_c_api_hdrs",
-        "//tensorflow/core/tpu/kernels:tpu_compile_op_common",
-        "//tensorflow/core/tpu/kernels:tpu_compile_op_support",
-        "//tensorflow/core/tpu/kernels:tpu_compile_proto_cc",
-        "//tensorflow/core/tpu/kernels:tpu_mesh_state_c_api_hdrs",
-        "//tensorflow/core/tpu/kernels:tpu_program_group",
-        "//tensorflow/core/tpu/kernels:tpu_program_group_interface",
-        "//tensorflow/core/tpu/kernels:tpu_util",
         "//tensorflow/stream_executor/tpu:tpu_executor",
         "//tensorflow/stream_executor/tpu:tpu_executor_c_api_hdrs",
         "@com_google_absl//absl/types:variant",
@@ -612,7 +620,7 @@ cc_library(
         ":tpu_compilation_cache_lookup",
         ":tpu_executable_info_proto_cc",
         ":tpu_op_consts",
-        "//tensorflow/compiler/jit:xla_device",
+        "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration",
         "//tensorflow/compiler/jit:xla_launch_util",
         "//tensorflow/compiler/jit:xla_tensor",
         "//tensorflow/compiler/tf2xla:common",
@@ -628,12 +636,12 @@ cc_library(
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/profiler/lib:traceme",
         "//tensorflow/core/tpu:tpu_configuration",
        "//tensorflow/core/tpu:tpu_defs",
         "//tensorflow/core/tpu:tpu_execute",
-        "//tensorflow/stream_executor:device_memory_allocator",
+        "//tensorflow/stream_executor",
         "//tensorflow/stream_executor/tpu:tpu_node_context",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
@@ -654,7 +662,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
-        "//tensorflow/core:graph",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
     ],
@@ -40,7 +40,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/profiler/lib/traceme.h"
 #include "tensorflow/core/tpu/kernels/tpu_compilation_cache_entry.h"
@@ -364,16 +363,16 @@ struct OutputBuffers {
         memory_allocator(allocator) {}
 
   ~OutputBuffers() {
-    buffers.buffers().ForEachElement([&](const xla::ShapeIndex& index,
-                                         const se::DeviceMemoryBase& buffer) {
+    buffers.buffers().ForEachElement(
+        [&](const xla::ShapeIndex& index, const se::DeviceMemoryBase& buffer) {
           if (owned_buffers.element(index) && !buffer.is_null()) {
             Status status =
                 memory_allocator->Deallocate(buffers.device_ordinal(), buffer);
            if (!status.ok()) {
              LOG(ERROR) << "Error deallocating buffer " << status;
            }
          }
        });
   }
 
   // Which of the buffers do we own?
@@ -3,12 +3,26 @@ package(
     licenses = ["notice"],  # Apache 2.0
 )
 
+cc_library(
+    name = "ops",
+    linkstatic = 1,
+    deps = [
+        ":host_compute_ops",
+        ":topk_ops",
+        ":tpu_compile_op",
+        ":tpu_execute_op",
+        ":tpu_partitioned_ops",
+    ],
+    alwayslink = 1,
+)
+
 cc_library(
     name = "tpu_partitioned_ops",
     srcs = [
         "tpu_partitioned_input_op.cc",
         "tpu_partitioned_output_op.cc",
     ],
+    linkstatic = 1,
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
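The alwayslink = 1 on the new ops aggregate matters because REGISTER_OP works through static initializers: nothing references those objects by symbol, so without alwayslink the linker could drop them and the op registrations would silently disappear. A sketch of a consumer that needs the registrations present at runtime; the test target and source names are hypothetical:

    tf_cc_test(
        name = "tpu_ops_registration_test",  # illustrative
        srcs = ["tpu_ops_registration_test.cc"],  # illustrative
        deps = [
            "//tensorflow/core:framework",
            "//tensorflow/core:test",
            "//tensorflow/core:test_main",
            # alwayslink keeps the REGISTER_OP initializers in the binary.
            "//tensorflow/core/tpu/ops",
        ],
    )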
@@ -22,6 +36,7 @@ cc_library(
     srcs = [
         "tpu_compile_op.cc",
     ],
+    linkstatic = 1,
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
@@ -35,6 +50,7 @@ cc_library(
     srcs = [
         "tpu_execute_op.cc",
     ],
+    linkstatic = 1,
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
@@ -48,6 +64,7 @@ cc_library(
     srcs = [
         "host_compute_ops.cc",
     ],
+    linkstatic = 1,
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -61,6 +78,7 @@ cc_library(
     srcs = [
         "topk_ops.cc",
     ],
+    linkstatic = 1,
     deps = [
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -30,17 +30,10 @@ REGISTER_OP("_HostComputeMlir")
     .Attr("Toutputs: list(type) >= 0")
     .Attr("key: string")
     .Attr("tpu_core: int = 0")
-    .SetIsStateful()
-    .Doc(R"doc(
-A host-side computation called from a TPU device.
-
-inputs: A list of tensors that will be sent to the host.
-outputs: A list of tensors that will be returned to the device.
-Tinputs: The element types of each element in `inputs`.
-Toutputs: The element types of each element in `outputs`.
-key: A unique identifier for this region used to match up host transfers.
-tpu_core: Default core to use for host to device transfers.
-)doc");
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      return ::tensorflow::shape_inference::UnknownShape(c);
+    })
+    .SetIsStateful();
 
 REGISTER_OP("XlaHostCompute")
     .Input("inputs: Tinputs")
@@ -78,36 +71,16 @@ REGISTER_OP("XlaHostCompute")
           // statically known.
           return ::tensorflow::shape_inference::UnknownShape(c);
         }
-      })
-      .Doc(R"doc(
-A pseudo-op to represent host-side computation in an XLA program.
-
-inputs: A list of tensors that will be sent to the host.
-outputs: A list of tensors that will be returned to the device.
-Tinputs: The element types of each element in `inputs`.
-Toutputs: The element types of each element in `outputs`.
-ancestors: A list of names of HostCompute computations that must be
-sequenced before this computation.
-shape_inference_graph: If non-empty, a serialized GraphDef representing a graph
-that must be analyzed at compile time to determine the shapes of the outputs.
-shapes: If shape_inference_graph is empty, a list of the shapes of `outputs`.
-key: A unique identifier for this region used to match up host transfers.
-cost_estimate_ns: Estimated duration of the host computation in nanoseconds.
-tpu_core: Default core to use for host to device transfers.
-)doc");
+      });
 
 REGISTER_OP("XlaSendToHost")
     .Input("input: Tinput")
     .Attr("Tinput: type")
     .Attr("key: string")
-    .SetIsStateful()
-    .Doc(R"doc(
-An op to send a tensor to the host.
-
-input: the tensor that will be sent to the host.
-Tinput: element type for input.
-key: A unique identifier for this region used to match up host transfers.
-)doc");
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      return ::tensorflow::shape_inference::UnknownShape(c);
+    })
+    .SetIsStateful();
 
 REGISTER_OP("XlaRecvFromHost")
     .Output("output: Toutput")
@@ -127,14 +100,6 @@ REGISTER_OP("XlaRecvFromHost")
           c->MakeShapeFromShapeProto(shape_attr->shape(), &handle));
       c->set_output(0, handle);
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op to receive a tensor from the host.
-
-output: the tensor that will be received from the host.
-Toutput: element type for output.
-shape: shape for output.
-key: A unique identifier for this region used to match up host transfers.
-)doc");
+    });
 
 } // namespace tensorflow
@@ -33,24 +33,7 @@ REGISTER_OP("KthOrderStatistic")
       TF_RETURN_IF_ERROR(c->Subshape(input, 0, -1, &s));
      c->set_output(0, s);
      return Status::OK();
-    })
-    .Doc(R"doc(
-Computes the Kth order statistic of a data set. The current
-implementation uses a binary search requiring exactly 32 passes over
-the input data. The running time is linear with respect to input
-size. The median-of-medians algorithm is probably faster, but is
-difficult to implement efficiently in XLA. The implementation imposes
-a total ordering on floats. The ordering is consistent with the usual
-partial order. Positive NaNs are greater than positive
-infinity. Negative NaNs are less than negative infinity. NaNs with
-distinct payloads are treated as distinct. Subnormal numbers are
-preserved (not flushed to zero). Positive infinity is greater than all
-numbers. Negative infinity is less than all numbers. Positive is
-greater than negative zero. There are less than k values greater than
-the kth order statistic. There are at least k values greater than or
-equal to the Kth order statistic. The semantics are not the same as
-top_k_unique.
-)doc");
+    });
 
 REGISTER_OP("TopKUnique")
     .Input("input: float32")
@@ -69,22 +52,7 @@ REGISTER_OP("TopKUnique")
       c->set_output(0, s);
       c->set_output(1, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the TopK unique values in the array in sorted order. The
-running time is proportional to the product of K and the input
-size. Sorting the whole array is more efficient for sufficiently large
-values of K. The median-of-medians algorithm is probably faster, but
-difficult to implement efficiently in XLA. If there are fewer than K
-unique numbers (not NANs), the results are padded with negative
-infinity. NaNs are never returned. Subnormal numbers are flushed to
-zero. If an element appears at multiple indices, the highest index is
-returned. If a TopK element never appears in the input due to padding
-values, the indices are padded with negative one. If a padding value
-appears in the input and padding is needed, the highest index of the
-padding value will be returned. The semantics are not the same as
-kth_order_statistic.
-)doc");
+    });
 
 REGISTER_OP("MakeUnique")
     .Input("input: float32")
@@ -94,14 +62,7 @@ REGISTER_OP("MakeUnique")
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
       c->set_output(0, input);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Make all elements in the non-Batch dimension unique, but \"close\" to
-their initial value. Never returns a sub-normal number. Never returns
-zero. The sign of each input element is always identical to the sign
-of the corresponding output element. Behavior for infinite elements is
-undefined. Behavior for subnormal elements is undefined.
-)doc");
+    });
 
 REGISTER_OP("TopKWithUnique")
     .Input("input: float32")
@@ -120,11 +81,5 @@ REGISTER_OP("TopKWithUnique")
       c->set_output(0, s);
       c->set_output(1, s);
       return Status::OK();
-    })
-    .Doc(R"doc(
-Returns the TopK values in the array in sorted order. This is a combination
-of MakeUnique and TopKUnique. The returned top-K will have its lower bits
-replaced by iota, thus it will be close to the original value but not exactly
-the same. The running time is proportional to the product of K and the input
-size. NaNs are never returned. Subnormal numbers are flushed to zero.)doc");
+    });
 } // namespace tensorflow
@ -43,23 +43,7 @@ REGISTER_OP("_TPUCompileMlir")
|
|||||||
c->set_output(i + 1, c->Vector(2));
|
c->set_output(i + 1, c->Vector(2));
|
||||||
}
|
}
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
})
|
});
|
||||||
.Doc(
|
|
||||||
R"(
|
|
||||||
Compiles a computations for execution on one or more TPU devices.
|
|
||||||
For the internal use of the distributed TPU compiler. Note that currently only
|
|
||||||
single TPU device is supported.
|
|
||||||
|
|
||||||
'mlir_module' is a serialized MLIR module with a `main` function that contains
|
|
||||||
target computation.
|
|
||||||
'dynamic_shapes' contains dynamic shapes of arguments whose shapes were not
|
|
||||||
known statically at TPUReplication rewrite time.
|
|
||||||
'metadata' is a serialized TPUCompileMetadataProto describing
|
|
||||||
the shapes and types of the inputs to the computation, as well as a mapping onto
|
|
||||||
the TPU pod topology.
|
|
||||||
'program' output is a string key that is passed to the _TPUExecute op and
|
|
||||||
used to look up the program in the compilation cache.
|
|
||||||
)");
|
|
||||||
|
|
||||||
REGISTER_OP("TPUCompile")
|
REGISTER_OP("TPUCompile")
|
||||||
.Attr("num_computations: int >= 0")
|
.Attr("num_computations: int >= 0")
|
||||||
@ -91,39 +75,13 @@ REGISTER_OP("TPUCompile")
|
|||||||
c->set_output(num_computations + i + 1, c->Scalar());
|
c->set_output(num_computations + i + 1, c->Scalar());
|
||||||
}
|
}
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
})
|
});
|
||||||
.Doc(
|
|
||||||
R"(
|
|
||||||
Compiles a computations for execution on one or more TPU devices.
|
|
||||||
For the internal use of the distributed TPU compiler.
|
|
||||||
|
|
||||||
'num_computations' is the number of computations to be compiled.
|
|
||||||
'function' is a function containing the computation to compile.
|
|
||||||
'dynamic_shapes' contains dynamic shapes of arguments whose shapes were not
|
|
||||||
known statically at TPUReplication rewrite time.
|
|
||||||
'guaranteed_constants' is a list of tensors which have been guaranteed to not
|
|
||||||
change their values during the session lifetime. These contain tensors marked as
|
|
||||||
constant using the GuaranteeConstOp.
|
|
||||||
'metadata' is a serialized TPUCompileMetadataProto describing
|
|
||||||
the shapes and types of the inputs to the computation, as well as a mapping onto
|
|
||||||
the TPU pod topology.
|
|
||||||
Each 'program' output is a string key that is passed to the _TPUExecute op and
|
|
||||||
used to look up the program in the compilation cache.
|
|
||||||
'may_modify_variables' indicates whether variables may be modified.
|
|
||||||
)");
|
|
||||||
|
|
||||||
REGISTER_OP("TPUCompileSucceededAssert")
|
REGISTER_OP("TPUCompileSucceededAssert")
|
||||||
.Input("compilation_status: string")
|
.Input("compilation_status: string")
|
||||||
// Do not optimize me away. Read the comment on TPUCompileOp for more
|
// Do not optimize me away. Read the comment on TPUCompileOp for more
|
||||||
// details.
|
// details.
|
||||||
.SetIsStateful()
|
.SetIsStateful()
|
||||||
.SetShapeFn(shape_inference::NoOutputs)
|
.SetShapeFn(shape_inference::NoOutputs);
|
||||||
.Doc(
|
|
||||||
R"(
|
|
||||||
Asserts that compilation succeeded. This op produces no output and closes the
|
|
||||||
device during failure to ensure all pending device interactions fail.
|
|
||||||
|
|
||||||
'compilation_status' is a serialized CompilationResultProto.
|
|
||||||
)");
|
|
||||||
|
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
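Both compile ops above emit a 'program' string key that the execute ops later use to fetch the compiled binary from a compilation cache. A hypothetical, much-simplified C++ sketch of that key/lookup handshake (CompiledProgram, CompilationCacheSketch, and the key format are invented for illustration; the real cache lives in the TPU runtime):

    #include <cstdint>
    #include <map>
    #include <memory>
    #include <string>
    #include <utility>

    struct CompiledProgram {};  // stand-in for a compiled TPU executable

    class CompilationCacheSketch {
     public:
      // Compile side: store the program and hand back the string key that a
      // TPUCompile-style op would emit as its 'program' output.
      std::string Insert(std::unique_ptr<CompiledProgram> program) {
        std::string key = "tpu_program_" + std::to_string(next_id_++);
        cache_[key] = std::move(program);
        return key;
      }
      // Execute side: a TPUExecute-style op looks the program up by key.
      CompiledProgram* Lookup(const std::string& key) {
        auto it = cache_.find(key);
        return it == cache_.end() ? nullptr : it->second.get();
      }

     private:
      int64_t next_id_ = 0;
      std::map<std::string, std::unique_ptr<CompiledProgram>> cache_;
    };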
@@ -35,10 +35,7 @@ REGISTER_OP("TPUExecute")
         c->set_output(i, c->UnknownShape());
       }
       return Status::OK();
-    })
-    .Doc(R"(
-Op that loads and executes a TPU program on a TPU device.
-For the internal use of the distributed TPU compiler.)");
+    });
 
 REGISTER_OP("TPUExecuteAndUpdateVariables")
     .Input("args: Targs")
@@ -58,14 +55,6 @@ REGISTER_OP("TPUExecuteAndUpdateVariables")
         c->set_output(i, c->UnknownShape());
       }
       return Status::OK();
-    })
-    .Doc(R"(Op that executes a program with optional in-place variable updates.
-It (optionally) reads device variables, loads and executes a TPU program on a
-TPU device, and then (optionally) in-place updates variables using the program
-outputs, as specified in attributes device_var_reads_indices (program input
-indices from directly reading variables) and device_var_updates_indices (program
-output indices used to update variables, -1 means no-update/read-only). Such
-program outputs are consumed by these variables will not appear in the op
-output. For the internal use of the distributed TPU compiler.)");
+    });
 
 }  // namespace tensorflow
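The doc text removed above spells out how TPUExecuteAndUpdateVariables pairs each variable with a program input and, optionally, a program output. A small self-contained illustration of that pairing under the documented convention (the index values here are made up):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    int main() {
      // device_var_reads_indices[v]: program input index fed from variable v.
      // device_var_updates_indices[v]: program output index written back to
      // variable v, or -1 for a read-only variable.
      std::vector<int> device_var_reads_indices = {0, 1};
      std::vector<int> device_var_updates_indices = {0, -1};
      for (std::size_t v = 0; v < device_var_reads_indices.size(); ++v) {
        std::printf("variable %zu -> program input %d, ", v,
                    device_var_reads_indices[v]);
        if (device_var_updates_indices[v] >= 0) {
          std::printf("updated from program output %d\n",
                      device_var_updates_indices[v]);
        } else {
          std::printf("read-only\n");
        }
      }
      return 0;
    }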
@@ -94,14 +94,6 @@ REGISTER_OP("TPUPartitionedInput")
       }
 
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that groups a list of partitioned inputs together. This op
-
-inputs: A list of partitioned inputs which must have the same shape.
-output: A handle which represents the full shape of partitioned tensors.
-partition_dim: An integer describles which dimension is partitioned. -1 means
-those inputs are replicated.
-)doc");
+    });
 
 }  // namespace tensorflow
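Per the removed doc text, TPUPartitionedInput groups N same-shaped pieces into one logical tensor: along partition_dim the full extent is N times the piece extent, and partition_dim = -1 means the pieces are replicas. A hedged shape-rule sketch (FullShapeSketch is an invented name, not part of the op):

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Shape-only illustration: the full shape implied by num_inputs equally
    // shaped pieces partitioned along partition_dim (-1 = replicated).
    std::vector<int64_t> FullShapeSketch(const std::vector<int64_t>& piece_shape,
                                         int num_inputs, int partition_dim) {
      std::vector<int64_t> full = piece_shape;
      if (partition_dim >= 0) {
        assert(partition_dim < static_cast<int>(full.size()));
        full[partition_dim] *= num_inputs;  // pieces concatenate along this dim
      }
      return full;  // replicated inputs share the per-piece shape
    }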
@@ -23,7 +23,6 @@ namespace tensorflow {
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 
-
 REGISTER_OP("TPUPartitionedOutput")
     .Input("inputs: T")
     .Output("output: num_splits * T")
@@ -53,14 +52,6 @@ REGISTER_OP("TPUPartitionedOutput")
         c->set_output(i, newoutput0);
       }
       return Status::OK();
-    })
-    .Doc(R"doc(
-An op that demultiplexes a tensor to be sharded by XLA to a list of partitioned
-outputs outside the XLA computation.
-
-inputs: A tensor which represents the full shape of partitioned tensors.
-output: A list of partitioned inputs which must have the same shape.
-partition_dim: An integer describles which dimension is partitioned.
-)doc");
+    });
 
 }  // namespace tensorflow
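TPUPartitionedOutput is the inverse: it splits the full tensor into num_splits pieces along partition_dim. The corresponding shape rule, again as an invented-name sketch under the assumption that the dimension divides evenly:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Shape-only illustration: per-output shape after splitting full_shape
    // into num_splits pieces along partition_dim.
    std::vector<int64_t> PieceShapeSketch(const std::vector<int64_t>& full_shape,
                                          int num_splits, int partition_dim) {
      std::vector<int64_t> piece = full_shape;
      assert(partition_dim >= 0 &&
             partition_dim < static_cast<int>(piece.size()));
      assert(piece[partition_dim] % num_splits == 0);  // assume even split
      piece[partition_dim] /= num_splits;
      return piece;
    }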
@@ -23,7 +23,6 @@ limitations under the License.
 
 #include "absl/base/casts.h"
 #include "absl/memory/memory.h"
-#include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_input_output_alias_config.h"
@@ -419,10 +418,6 @@ xla::StatusOr<xla::ExecutionOutput> TPUExecute(
 
   xla::Backend* backend = node_context->backend();
 
-  XlaDevice* device =
-      tensorflow::down_cast<XlaDevice*>(ctx->device()->UnderlyingDevice());
-  TF_RET_CHECK(device);
-
   // Create a HostTransferManager to handle Send/Recv operations from the TPU.
   std::shared_ptr<HostTransferManager> host_transfer_manager =
       std::make_shared<HostTransferManager>(node_context, backend);
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/graph/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/tpu/virtual_device.h"
@@ -3,7 +3,7 @@
 # ":platform" - Low-level and platform-specific Python code.
 
 load("//tensorflow:tensorflow.bzl", "py_strict_library")
-load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "if_not_windows", "if_tpu", "if_xla_available", "py_test", "py_tests", "tf_cc_shared_object", "tf_cc_test", "tf_cuda_library", "tf_enable_mlir_bridge", "tf_gen_op_wrapper_py")
+load("//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_mlir", "if_not_windows", "if_xla_available", "py_test", "py_tests", "tf_cc_shared_object", "tf_cc_test", "tf_cuda_library", "tf_enable_mlir_bridge", "tf_gen_op_wrapper_py")
 
 # buildifier: disable=same-origin-load
 load("//tensorflow:tensorflow.bzl", "tf_monitoring_python_deps")
@@ -6093,8 +6093,6 @@ pywrap_tensorflow_macro(
         "@ngraph_tf//:ngraph_tf",
     ]) + if_xla_available([
         "//tensorflow/compiler/aot:tfcompile_lib",
-    ]) + if_tpu([
-        "//tensorflow/core/tpu:tpu_api_dlsym_initializer",
     ]) + if_static(extra_deps = ["//tensorflow/core/platform:tf32_utils"]),
 )
 
@@ -68,7 +68,7 @@ cc_library(
     deps = [
         ":c_api_decl",
         ":tpu_executor_c_api_hdrs",
-        "//tensorflow/core/platform:status",
+        "//tensorflow/core:lib",
         "//tensorflow/core/tpu:tpu_api",
     ],
 )
@@ -103,9 +103,7 @@ cc_library(
         ":tpu_executor_interface",
         ":tpu_platform_interface",
        ":tpu_stream_interface",
-        "//tensorflow/core/platform:casts",
-        "//tensorflow/core/platform:mutex",
-        "//tensorflow/core/platform:types",
+        "//tensorflow/core:lib",
         "//tensorflow/core/tpu:tpu_api",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor/lib",
@@ -131,8 +129,6 @@ cc_library(
         ":status_helper",
         ":tpu_executor_c_api_hdrs",
         "//tensorflow/core:lib",
-        "//tensorflow/core/platform:mutex",
-        "//tensorflow/core/platform:types",
         "//tensorflow/core/tpu:tpu_api",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor/lib",
@@ -164,8 +160,6 @@ cc_library(
         "//tensorflow/c:tf_status",
         "//tensorflow/c:tf_status_helper",
         "//tensorflow/core:lib",
-        "//tensorflow/core/platform:mutex",
-        "//tensorflow/core/platform:types",
         "//tensorflow/core/tpu:tpu_api",
         "//tensorflow/stream_executor",
         "//tensorflow/stream_executor/lib",
@@ -274,10 +268,8 @@ cc_library(
     hdrs = ["tpu_platform_interface.h"],
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/core/platform:mutex",
-        "//tensorflow/core/platform:types",
-        "//tensorflow/stream_executor:multi_platform_manager",
-        "//tensorflow/stream_executor:stream_executor_headers",
+        "//tensorflow/core:lib",
+        "//tensorflow/stream_executor",
     ],
 )
 
@@ -334,7 +326,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":c_api_decl",
-        "//tensorflow/core/platform:types",
+        "//tensorflow/core:lib",
         "//tensorflow/core/tpu:tpu_api",
     ],
 )
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/stream_executor/tpu/tpu_platform_interface.h"
 
 namespace ApiConverter {
+
 xla::ShapedBuffer FromC(XLA_ShapedBuffer* c_buffer) {
   xla::Shape xla_on_host_shape = ApiConverter::FromC(&c_buffer->on_host_shape);
   xla::Shape xla_on_device_shape =
@@ -114,6 +115,7 @@ SE_DeviceMemoryAllocator ToC(
   };
   return se_allocator;
 }
+
 SE_MaybeOwningDeviceMemory ToC(stream_executor::OwningDeviceMemory* mem) {
   SE_MaybeOwningDeviceMemory se_mem;
   se_mem.device_ordinal = mem->device_ordinal();
@@ -349,7 +349,6 @@ struct TfTpu_ExecutorApiFn {
 
   TFTPU_ADD_FN_IN_STRUCT(TpuCompiler_New);
   TFTPU_ADD_FN_IN_STRUCT(TpuCompiler_Free);
-
   TFTPU_ADD_FN_IN_STRUCT(TpuCompiler_RunHloPasses);
   TFTPU_ADD_FN_IN_STRUCT(TpuCompiler_RunBackend);
   TFTPU_ADD_FN_IN_STRUCT(TpuCompiler_Compile);