Introduce additional TPU infeed and outfeed ops
PiperOrigin-RevId: 325542225
Change-Id: Ie972e60d6c5639b71719837c500ecc716eda2ebd

parent 769155a21e
commit 3ba0deba91
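All of the kernels added in this change build on a shared asynchronous transfer base class (see transfer_ops.h / transfer_ops.cc below). As a rough orientation, a hypothetical kernel using that base class would look like the sketch below; the class name ExampleTransferOp, the op name "ExampleTransfer", and the transfer_type string are illustrative only and are not part of this change.

// Minimal sketch (not part of this commit): a hypothetical transfer kernel
// built on the TpuTransferAsyncOpKernel base class introduced below.
#include "tensorflow/core/tpu/kernels/transfer_ops.h"

namespace tensorflow {

class ExampleTransferOp : public TpuTransferAsyncOpKernel {
 public:
  explicit ExampleTransferOp(OpKernelConstruction* ctx)
      : TpuTransferAsyncOpKernel(ctx, /*transfer_type=*/"example_transfer",
                                 /*number_of_threads=*/1) {}

  // DoWork runs on the kernel's transfer thread pool; the base class resolves
  // the StreamExecutor from the XLA device or the device_ordinal attribute.
  Status DoWork(OpKernelContext* ctx,
                xla::TpuTransferManagerInterface* transfer_manager,
                stream_executor::StreamExecutor* stream_executor) override {
    // Move data through the infeed/outfeed interface here, e.g. via
    // transfer_manager->TransferLiteralToInfeed(...).
    return Status::OK();
  }
};

// Hypothetical registration, mirroring the registrations in the new kernels.
REGISTER_KERNEL_BUILDER(Name("ExampleTransfer").Device(DEVICE_CPU),
                        ExampleTransferOp);

}  // namespace tensorflow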
@@ -88,7 +88,13 @@ cc_library(
     name = "tpu_defs",
     srcs = ["tpu_defs.cc"],
     hdrs = ["tpu_defs.h"],
-    deps = ["//tensorflow/core:protos_all_cc"],
+    deps = [
+        ":tpu_api",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/stream_executor/tpu:c_api_conversions",
+        "//tensorflow/stream_executor/tpu:c_api_decl",
+    ],
 )
 
 cc_library(

@@ -28,10 +28,16 @@ tf_kernel_library(
     deps = [
         ":cross_replica_ops",
         ":host_compute_ops",
+        ":image_resize_ops",
+        ":infeed_ops",
+        ":outfeed_ops",
+        ":replication_ops",
         ":topk_ops",
         ":tpu_compile_op",
         ":tpu_configuration_ops",
         ":tpu_execute_op",
+        ":tpu_handle_to_key_op",
+        ":transfer_ops",
     ],
 )
 
@@ -684,3 +690,104 @@ cc_library(
     ],
     alwayslink = 1,
 )
+
+cc_library(
+    name = "infeed_ops",
+    srcs = ["infeed_ops.cc"],
+    hdrs = ["infeed_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":transfer_ops",
+        "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/common_runtime:dma_helper",
+        "//tensorflow/core/framework:protos_all_cc",
+        "//tensorflow/core/kernels:transpose_functor",
+        "//tensorflow/core/platform:status",
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/core/tpu:tpu_defs",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor/tpu:tpu_transfer_manager_base",
+        "//tensorflow/stream_executor/tpu:tpu_transfer_manager_interface",
+    ],
+    alwayslink = True,
+)
+
+cc_library(
+    name = "transfer_ops",
+    srcs = ["transfer_ops.cc"],
+    hdrs = ["transfer_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/stream_executor:multi_platform_manager",
+        "//tensorflow/stream_executor/tpu:tpu_node_context",
+        "//tensorflow/stream_executor/tpu:tpu_platform_interface",
+        "//tensorflow/stream_executor/tpu:tpu_transfer_manager_interface",
+    ],
+    alwayslink = True,
+)
+
+cc_library(
+    name = "outfeed_ops",
+    srcs = ["outfeed_ops.cc"],
+    hdrs = ["outfeed_ops.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":transfer_ops",
+        "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration",
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/framework:protos_all_cc",
+        "//tensorflow/core/tpu:tpu_defs",
+        "//tensorflow/stream_executor:multi_platform_manager",
+    ],
+    alwayslink = True,
+)
+
+cc_library(
+    name = "image_resize_ops",
+    srcs = ["image_resize_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/tf2xla:common",
+        "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/tpu:tpu_defs",
+        "@com_google_absl//absl/strings",
+    ],
+    alwayslink = True,
+)
+
+cc_library(
+    name = "replication_ops",
+    srcs = ["replication_ops.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/tpu:tpu_defs",
+    ],
+    alwayslink = True,
+)
+
+cc_library(
+    name = "tpu_handle_to_key_op",
+    srcs = ["tpu_handle_to_key_op.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":tpu_compilation_cache_interface",
+        ":tpu_op_consts",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/tpu:tpu_configuration",
+    ],
+    alwayslink = True,
+)
tensorflow/core/tpu/kernels/image_resize_ops.cc (new file, 155 lines)
@@ -0,0 +1,155 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/string_view.h"
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
#include "tensorflow/compiler/xla/client/lib/constants.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/core/framework/kernel_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/tpu/tpu_defs.h"

namespace tensorflow {

class TpuCustomResizeOp : public XlaOpKernel {
 public:
  explicit TpuCustomResizeOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("align_corners", &align_corners_));
    OP_REQUIRES_OK(ctx,
                   ctx->GetAttr("half_pixel_centers", &half_pixel_centers_));
  }

  xla::Shape GetOutputShape(XlaOpKernelContext* ctx) const {
    std::vector<int64> out_size;
    auto status = ctx->ConstantInputAsIntVector(1, &out_size);
    CHECK_EQ(out_size.size(), 2) << status.ToString();
    xla::Shape output_shape =
        TensorShapeToXLAShape(ctx->output_xla_type(0), ctx->InputShape(0));
    output_shape.mutable_dimensions()[1] = out_size[0];
    output_shape.mutable_dimensions()[2] = out_size[1];
    return output_shape;
  }

  string OpaqueField() const {
    return absl::StrCat("\"", align_corners_, half_pixel_centers_, "\"");
  }

  void CompileGrad(XlaOpKernelContext* ctx, const char* target,
                   const xla::Shape& output_shape) {
    auto input_shape =
        TensorShapeToXLAShape(ctx->output_xla_type(0), ctx->InputShape(0));
    if (ctx->InputShape(1).dim_sizes() == ctx->InputShape(0).dim_sizes()) {
      ctx->SetOutput(
          0, xla::ConvertElementType(ctx->Input(0), ctx->output_xla_type(0)));
      return;
    }
    // The gradient should be done in two phases for large resizes.
    auto input = ctx->Input(0);
    if (input_shape.dimensions(1) / output_shape.dimensions(1) > 3 &&
        input_shape.dimensions(2) / output_shape.dimensions(2) > 3) {
      auto intermediate_shape = output_shape;
      intermediate_shape.mutable_dimensions()[1] = input_shape.dimensions(1);
      input = xla::CustomCall(ctx->builder(), target, {ctx->Input(0)},
                              intermediate_shape, OpaqueField());
    }
    ctx->SetOutput(0, xla::CustomCall(ctx->builder(), target, {input},
                                      output_shape, OpaqueField()));
  }

  void CompileForward(XlaOpKernelContext* ctx, const char* target) {
    auto output_shape = GetOutputShape(ctx);
    if (ctx->InputShape(0).dim_size(1) == output_shape.dimensions(1) &&
        ctx->InputShape(0).dim_size(2) == output_shape.dimensions(2)) {
      ctx->SetOutput(
          0, xla::ConvertElementType(ctx->Input(0), ctx->output_xla_type(0)));
      return;
    }
    if (ctx->InputShape(0).dim_size(1) == 1 &&
        ctx->InputShape(0).dim_size(2) == 1) {
      ctx->SetOutput(0,
                     ctx->Input(0) + xla::Zeros(ctx->builder(), output_shape));
      return;
    }
    ctx->SetOutput(0, xla::CustomCall(ctx->builder(), target, {ctx->Input(0)},
                                      output_shape, OpaqueField()));
  }

 private:
  bool align_corners_;
  bool half_pixel_centers_;
};

class TpuResizeNearestNeighborOp : public TpuCustomResizeOp {
 public:
  explicit TpuResizeNearestNeighborOp(OpKernelConstruction* ctx)
      : TpuCustomResizeOp(ctx) {}
  void Compile(XlaOpKernelContext* ctx) override {
    CompileForward(ctx, "ResizeNearest");
  }
};

class TpuResizeBilinearOp : public TpuCustomResizeOp {
 public:
  explicit TpuResizeBilinearOp(OpKernelConstruction* ctx)
      : TpuCustomResizeOp(ctx) {}
  void Compile(XlaOpKernelContext* ctx) override {
    CompileForward(ctx, "ResizeBilinear");
  }
};

class TpuResizeNearestNeighborGradOp : public TpuCustomResizeOp {
 public:
  explicit TpuResizeNearestNeighborGradOp(OpKernelConstruction* ctx)
      : TpuCustomResizeOp(ctx) {}
  void Compile(XlaOpKernelContext* ctx) override {
    CompileGrad(ctx, "ResizeNearestGrad", GetOutputShape(ctx));
  }
};

class TpuResizeBilinearGradOp : public TpuCustomResizeOp {
 public:
  explicit TpuResizeBilinearGradOp(OpKernelConstruction* ctx)
      : TpuCustomResizeOp(ctx) {}
  void Compile(XlaOpKernelContext* ctx) override {
    auto output_shape =
        TensorShapeToXLAShape(ctx->output_xla_type(0), ctx->InputShape(1));
    CompileGrad(ctx, "ResizeBilinearGrad", output_shape);
  }
};

REGISTER_XLA_OP(Name("ResizeNearestNeighbor")
                    .CompileTimeConstantInput("size")
                    .Device(DEVICE_TPU_XLA_JIT),
                TpuResizeNearestNeighborOp);

REGISTER_XLA_OP(Name("ResizeNearestNeighborGrad")
                    .CompileTimeConstantInput("size")
                    .Device(DEVICE_TPU_XLA_JIT),
                TpuResizeNearestNeighborGradOp);

REGISTER_XLA_OP(Name("ResizeBilinear")
                    .CompileTimeConstantInput("size")
                    .Device(DEVICE_TPU_XLA_JIT),
                TpuResizeBilinearOp);

REGISTER_XLA_OP(Name("ResizeBilinearGrad").Device(DEVICE_TPU_XLA_JIT),
                TpuResizeBilinearGradOp);

}  // namespace tensorflow
tensorflow/core/tpu/kernels/infeed_ops.cc (new file, 529 lines)
@@ -0,0 +1,529 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/tpu/kernels/infeed_ops.h"

#include <algorithm>
#include <vector>

#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/tf2xla/literal_util.h"
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/dataset.h"
#include "tensorflow/core/framework/function.h"
#include "tensorflow/core/framework/function_handle_cache.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/framework/variant_encode_decode.h"
#include "tensorflow/core/framework/variant_tensor_data.h"
#include "tensorflow/core/kernels/transpose_functor.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/tpu/kernels/transfer_ops.h"
#include "tensorflow/core/tpu/tpu_defs.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/tpu/tpu_transfer_manager.h"
#include "tensorflow/stream_executor/tpu/tpu_transfer_manager_interface.h"

namespace tensorflow {
namespace {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef tensorflow::tpu::NoncopyableBuffer LinearizerBuffer;
typedef std::deque<LinearizerBuffer> LinearizerBufferList;

// Transposes the given tensor using the tensorflow C++ transpose implementation
// to obtain a XLA literal for the host tensor laid out as the given layout. The
// returned tensor is normalized to the dim0major layout -- F32[10,20,30]{2,0,1}
// is returned as F32[20,10,30]{2,1,0}.
xla::StatusOr<Tensor> TransposeTensor(OpKernelContext* ctx,
                                      const Tensor& input_tensor,
                                      const xla::Shape& xla_shape) {
  profiler::TraceMe trace_me("TransposeTensor", /*level=*/2);
  const int64 rank = xla_shape.rank();
  std::vector<int32> permutation(rank);
  std::vector<int64> transposed_shapes(rank);
  for (int64 i = 0; i < rank; ++i) {
    permutation[i] = xla_shape.layout().minor_to_major(rank - 1 - i);
    transposed_shapes[i] = xla_shape.dimensions(permutation[i]);
  }

  Tensor transposed_tensor;

  // If this is a trivial transpose (i.e., bitcast), just create an aliased
  // tensor with the transposed shape.
  if (xla::LayoutUtil::IsMonotonicWithDim0Major(
          xla::ShapeUtil::DropDegenerateDimensions(xla_shape).layout())) {
    TensorShape shape;
    TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(transposed_shapes, &shape));
    TF_RETURN_IF_ERROR(transposed_tensor.BitcastFrom(
        input_tensor, input_tensor.dtype(), shape));
    return transposed_tensor;
  }

  AllocatorAttributes alloc_attr;
  alloc_attr.set_on_host(true);
  TF_RETURN_IF_ERROR(ctx->allocate_temp(input_tensor.dtype(),
                                        TensorShape(transposed_shapes),
                                        &transposed_tensor, alloc_attr));
  // Eigen Transpose fails with SIGFPE if there is a dimension of size 0.
  if (input_tensor.NumElements() > 0) {
    TF_RETURN_IF_ERROR(DoTranspose<CPUDevice>(ctx->eigen_device<CPUDevice>(),
                                              input_tensor, permutation,
                                              &transposed_tensor));
  }
  return transposed_tensor;
}

xla::StatusOr<bool> GetLayoutOverride(OpKernelConstruction* ctx,
                                      const char* attrn_name,
                                      std::vector<int64>* minor_to_major) {
  if (!ctx->HasAttr(attrn_name)) {
    return false;
  }
  TF_RETURN_IF_ERROR(ctx->GetAttr(attrn_name, minor_to_major));
  return !minor_to_major->empty();
}

Status GetInfeedShapeWithLayout(OpKernelConstruction* ctx,
                                const char* attrn_name,
                                const xla::Shape& input_shape,
                                xla::Shape* output_shape) {
  std::vector<int64> minor_to_major;
  TF_ASSIGN_OR_RETURN(bool has_override,
                      GetLayoutOverride(ctx, attrn_name, &minor_to_major));
  if (!has_override) {
    *output_shape = input_shape;
    if (output_shape->IsTuple()) {
      int64 tuple_elements = xla::ShapeUtil::TupleElementCount(*output_shape);
      for (int64 i = 0; i < tuple_elements; ++i) {
        xla::Shape* sub_shape =
            xla::ShapeUtil::GetMutableSubshape(output_shape, {i});
        *sub_shape->mutable_layout() = GetTPUInfeedLayout(*sub_shape).layout();
      }
    } else {
      *output_shape->mutable_layout() =
          GetTPUInfeedLayout(*output_shape).layout();
    }
    return Status::OK();
  }

  auto layout_func = [](const xla::Shape& shape) -> xla::Layout {
    return GetTPUInfeedLayout(shape).layout();
  };
  return GetShapeWithLayout(input_shape, minor_to_major, layout_func,
                            output_shape);
}

// LinearizedBuffersWrapper is an opaque C++ data structure for the outputs of
// PrelinearizeOp and PrelinearizeTupleOp. It holds the resultant linearized
// buffers and references to input tensors whose underlying storage are shared
// with linearized buffers.
// NOTE: This is not a feature-complete implementation of the DT_VARIANT
// specification. In particular, we cannot currently serialize an arbitrary
// `LinearizerBufferList` (aka `std::deque<LinearizerBuffer>`)
// object, so the `Encode()` and `Decode()` methods are not implemented.
struct LinearizedBuffersWrapper {
  explicit LinearizedBuffersWrapper() {}
  explicit LinearizedBuffersWrapper(LinearizerBufferList bufs,
                                    std::vector<tensorflow::Tensor> ts)
      : buffers(std::move(bufs)), tensors(std::move(ts)) {}
  LinearizedBuffersWrapper(const LinearizedBuffersWrapper& wrapper) {
    // tensorflow::Variant requires this copy constructor to compile.
    LOG(FATAL) << "LinearizedBuffersWrapper should not copy.";
  }
  LinearizedBuffersWrapper& operator=(const LinearizedBuffersWrapper& wrapper) =
      delete;
  LinearizedBuffersWrapper(LinearizedBuffersWrapper&&) = default;
  LinearizedBuffersWrapper& operator=(LinearizedBuffersWrapper&&) = default;
  ~LinearizedBuffersWrapper() = default;

  // These functions are tensorflow::Variant requirements.
  string TypeName() const { return "(anonymous)::LinearizedBuffersWrapper"; }
  void Encode(tensorflow::VariantTensorData* data) const {
    LOG(ERROR) << "Encode() is not implemented for LinearizedBuffersWrapper "
                  "objects.";
  }
  bool Decode(const tensorflow::VariantTensorData& data) {
    LOG(ERROR) << "Decode() is not implemented for LinearizedBuffersWrapper "
                  "objects.";
    return false;
  }

  LinearizerBufferList buffers;
  // Save references on tensors whose underlying storage are shared with
  // LiteralLinearizer::Buffer in `buffers`.
  std::vector<tensorflow::Tensor> tensors;
};

Status AutoTransposeAndLinearize(OpKernelContext* ctx,
                                 const Tensor& input_tensor,
                                 const xla::Shape& shape,
                                 LinearizerBufferList* linearized_buffers,
                                 std::vector<Tensor>* saved_input_tensors) {
  const Tensor* tensor = &input_tensor;
  bool has_transposed = false;
  Tensor transposed_tensor;
  if (!xla::LayoutUtil::IsMonotonicWithDim0Major(shape.layout())) {
    // If the given layout is not in dim0major layout, transpose the tensor.
    TF_ASSIGN_OR_RETURN(transposed_tensor,
                        TransposeTensor(ctx, input_tensor, shape));
    tensor = &transposed_tensor;
    has_transposed = true;
  }

  xla::BorrowingLiteral literal;
  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(*tensor, &literal));

  TF_RETURN_IF_ERROR(
      xla::TpuTransferManagerInterface::GetRegisteredTpuTransferManager()
          ->LinearizeToBuffers(literal, linearized_buffers));

  // The input tensor is ref-counted. Save a handle on the input tensor if
  // its underlying storage is shared with linearized buffers to prevent
  // input tensor from getting freed.
  for (const auto& buffer : *linearized_buffers) {
    if (!buffer.owns_data() && !has_transposed) {
      // `buffer` is created from zero-copy fast path from the un-transposed
      // input tensor so its underlying data is shared with input tensor.
      // Save a handle to input tensor to increment its ref-count and avoid
      // it getting deallocated after PrelinearizeTupleOp completes.
      saved_input_tensors->push_back(*tensor);
      // A literal can be linearized to zero to two buffers. If any of the
      // linearized buffers shares storage with the input tensor, we save
      // exactly one handle on the input tensor.
      break;
    }
  }
  return Status::OK();
}

// PrelinearizeOp is used to linearize one tensor to the device format.
class PrelinearizeOp : public OpKernel {
 public:
  explicit PrelinearizeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
    xla::Shape shape;
    OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape_, &shape));
    OP_REQUIRES_OK(ctx,
                   GetInfeedShapeWithLayout(ctx, "layout", shape, &xla_shape_));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& input_tensor = ctx->input(0);
    // Validate input.
    OP_REQUIRES(
        ctx, input_tensor.dtype() == dtype_,
        errors::InvalidArgument("Prelinearize dtype mismatch; expected ",
                                DataType_Name(dtype_), ", got ",
                                DataType_Name(input_tensor.dtype())));
    OP_REQUIRES(
        ctx, input_tensor.shape() == shape_,
        errors::InvalidArgument("Prelinearize shape mismatch; expected ",
                                shape_.DebugString(), ", got ",
                                input_tensor.shape().DebugString()));

    // Auto-transpose and prelinearize.
    LinearizerBufferList linearized_buffers;
    std::vector<Tensor> saved_input_tensors;
    auto status =
        AutoTransposeAndLinearize(ctx, input_tensor, xla_shape_,
                                  &linearized_buffers, &saved_input_tensors);
    OP_REQUIRES_OK(ctx, status);

    // Write to output.
    tensorflow::Tensor* output;
    OP_REQUIRES_OK(ctx,
                   ctx->allocate_output(0, tensorflow::TensorShape{}, &output));
    output->scalar<tensorflow::Variant>()() = LinearizedBuffersWrapper{
        std::move(linearized_buffers), std::move(saved_input_tensors)};
  }

  bool IsExpensive() override { return true; }

 private:
  TensorShape shape_;
  DataType dtype_;
  xla::Shape xla_shape_;

  // PrelinearizeOp is neither copyable nor movable.
  PrelinearizeOp(const PrelinearizeOp&) = delete;
  PrelinearizeOp& operator=(const PrelinearizeOp&) = delete;
};

// PrelinearizeTupleOp is used to linearize multiple tensors to the device
// format.
class PrelinearizeTupleOp : public OpKernel {
 public:
  explicit PrelinearizeTupleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &shapes_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
    OP_REQUIRES(
        ctx, shapes_.size() == dtypes_.size(),
        errors::InvalidArgument(
            "shapes and dtypes must be the same length. shapes length = ",
            shapes_.size(), ", dtypes length = ", dtypes_.size()));

    std::vector<xla::Shape> xla_shapes;
    for (int i = 0; i < shapes_.size(); i++) {
      xla::Shape xla_shape;
      OP_REQUIRES_OK(ctx,
                     TensorShapeToXLAShape(dtypes_[i], shapes_[i], &xla_shape));
      xla_shapes.push_back(xla_shape);
    }
    OP_REQUIRES_OK(
        ctx, GetInfeedShapeWithLayout(
                 ctx, "layouts", xla::ShapeUtil::MakeTupleShape(xla_shapes),
                 &tuple_shape_));
  }

  void Compute(OpKernelContext* ctx) override {
    OpInputList values;
    OP_REQUIRES_OK(ctx, ctx->input_list("inputs", &values));
    OP_REQUIRES(ctx, values.size() == shapes_.size(),
                errors::InvalidArgument(
                    "Wrong number of inputs to PrelinearizeTuple."));

    LinearizerBufferList all_linearized_buffers;
    std::vector<Tensor> all_saved_input_tensors;
    for (int i = 0; i < values.size(); i++) {
      // Validate input.
      const Tensor& input_tensor = values[i];
      OP_REQUIRES(ctx, input_tensor.dtype() == dtypes_[i],
                  errors::InvalidArgument(
                      "PrelinearizeTuple dtype mismatch at tuple element ", i,
                      "; expected ", DataType_Name(dtypes_[i]), ", got ",
                      DataType_Name(input_tensor.dtype())));
      OP_REQUIRES(ctx, input_tensor.shape() == shapes_[i],
                  errors::InvalidArgument(
                      "PrelinearizeTuple shape mismatch at tuple element ", i,
                      "; expected ", shapes_[i].DebugString(), ", got ",
                      input_tensor.shape().DebugString()));

      // Auto-transpose and prelinearize.
      LinearizerBufferList linearized_buffers;
      std::vector<Tensor> saved_input_tensors;
      auto status = AutoTransposeAndLinearize(
          ctx, input_tensor, tuple_shape_.tuple_shapes(i), &linearized_buffers,
          &saved_input_tensors);
      OP_REQUIRES_OK(ctx, status);
      all_linearized_buffers.insert(
          all_linearized_buffers.end(),
          std::make_move_iterator(linearized_buffers.begin()),
          std::make_move_iterator(linearized_buffers.end()));
      all_saved_input_tensors.insert(
          all_saved_input_tensors.end(),
          std::make_move_iterator(saved_input_tensors.begin()),
          std::make_move_iterator(saved_input_tensors.end()));
    }

    tensorflow::Tensor* output;
    OP_REQUIRES_OK(ctx,
                   ctx->allocate_output(0, tensorflow::TensorShape{}, &output));
    output->scalar<tensorflow::Variant>()() = LinearizedBuffersWrapper{
        std::move(all_linearized_buffers), std::move(all_saved_input_tensors)};
  }

  bool IsExpensive() override { return true; }

 private:
  std::vector<TensorShape> shapes_;
  DataTypeVector dtypes_;
  xla::Shape tuple_shape_;

  // PrelinearizeTupleOp is neither copyable nor movable.
  PrelinearizeTupleOp(const PrelinearizeTupleOp&) = delete;
  PrelinearizeTupleOp& operator=(const PrelinearizeTupleOp&) = delete;
};

// The InfeedEnqueuePrelinearizedBufferOp op is used to transfer prelinearized
// buffers to the device infeed queue.
class InfeedEnqueuePrelinearizedBufferOp : public TpuTransferAsyncOpKernel {
 public:
  explicit InfeedEnqueuePrelinearizedBufferOp(OpKernelConstruction* ctx)
      : TpuTransferAsyncOpKernel(ctx, "prelinearized_buffers_to_infeed", 8) {}

  Status DoWork(OpKernelContext* ctx,
                xla::TpuTransferManagerInterface* transfer_manager,
                stream_executor::StreamExecutor* stream_executor) override {
    const Tensor& input_tensor = ctx->input(0);
    const LinearizedBuffersWrapper* wrapper =
        input_tensor.scalar<tensorflow::Variant>()()
            .get<LinearizedBuffersWrapper>();
    TF_RETURN_IF_ERROR(transfer_manager->TransferBuffersToInfeed(
        stream_executor, wrapper->buffers));

    return Status::OK();
  }

 private:
  // InfeedEnqueuePrelinearizedBufferOp is neither copyable nor movable.
  InfeedEnqueuePrelinearizedBufferOp(
      const InfeedEnqueuePrelinearizedBufferOp&) = delete;
  InfeedEnqueuePrelinearizedBufferOp& operator=(
      const InfeedEnqueuePrelinearizedBufferOp&) = delete;
};

}  // anonymous namespace

TpuInfeedEnqueueOp::TpuInfeedEnqueueOp(OpKernelConstruction* ctx)
    : TpuTransferAsyncOpKernel(ctx, "infeed_enqueue", 8) {
  OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_));
  OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
  xla::Shape shape;
  OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape_, &shape));
  OP_REQUIRES_OK(ctx,
                 GetInfeedShapeWithLayout(ctx, "layout", shape, &xla_shape_));
}

Status TpuInfeedEnqueueOp::DoWork(
    OpKernelContext* ctx, xla::TpuTransferManagerInterface* transfer_manager,
    stream_executor::StreamExecutor* stream_executor) {
  const Tensor& input_tensor = ctx->input(0);

  // Validate runtime shape and fail if it doesn't match the contract.
  if (input_tensor.dtype() != dtype_) {
    return errors::InvalidArgument("Infeed dtype mismatch.");
  }
  if (input_tensor.shape() != shape_) {
    return errors::InvalidArgument("Infeed shape mismatch; expected ",
                                   shape_.DebugString(), ", got ",
                                   input_tensor.shape().DebugString());
  }

  const Tensor* tensor = &input_tensor;
  Tensor transposed_tensor;
  if (!xla::LayoutUtil::IsMonotonicWithDim0Major(xla_shape_.layout())) {
    // If the given layout is not in dim0major layout, transpose the tensor.
    TF_ASSIGN_OR_RETURN(transposed_tensor,
                        TransposeTensor(ctx, input_tensor, xla_shape_));
    tensor = &transposed_tensor;
  }

  xla::BorrowingLiteral literal;
  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(*tensor, &literal));

  // Transfer the given literal to the Infeed interface of the device.
  TF_RETURN_IF_ERROR(
      transfer_manager->TransferLiteralToInfeed(stream_executor, literal));
  return Status::OK();
}

TpuInfeedEnqueueTupleOp::TpuInfeedEnqueueTupleOp(OpKernelConstruction* ctx)
    : TpuTransferAsyncOpKernel(ctx, "infeed_enqueue", 8) {
  OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &shapes_));
  OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
  OP_REQUIRES(
      ctx, shapes_.size() == dtypes_.size(),
      errors::InvalidArgument("shapes and dtypes must be the same length."));

  std::vector<xla::Shape> xla_shapes;
  for (int i = 0; i < shapes_.size(); i++) {
    xla::Shape xla_shape;
    OP_REQUIRES_OK(ctx,
                   TensorShapeToXLAShape(dtypes_[i], shapes_[i], &xla_shape));
    xla_shapes.push_back(xla_shape);
  }
  OP_REQUIRES_OK(
      ctx, GetInfeedShapeWithLayout(ctx, "layouts",
                                    xla::ShapeUtil::MakeTupleShape(xla_shapes),
                                    &tuple_shape_));
}

Status TpuInfeedEnqueueTupleOp::DoWork(
    OpKernelContext* ctx, xla::TpuTransferManagerInterface* transfer_manager,
    stream_executor::StreamExecutor* stream_executor) {
  OpInputList values;
  TF_RETURN_IF_ERROR(ctx->input_list("inputs", &values));
  if (values.size() != shapes_.size()) {
    return errors::InvalidArgument(
        "Wrong number of inputs to InfeedEnqueueTuple.");
  }

  for (const auto& shapes : shapes_) {
    VLOG(1) << "TransferLiteralToInfeed " << shapes.DebugString();
  }

  std::vector<Tensor> maybe_transposed_tensors;
  maybe_transposed_tensors.reserve(values.size());
  for (int i = 0; i < values.size(); i++) {
    // Validate runtime shapes and fail if they don't match the contract.
    const Tensor* tensor = &values[i];
    if (tensor->shape() != shapes_[i]) {
      return errors::InvalidArgument("Infeed shape mismatch for tuple element ",
                                     i, "; expected ", shapes_[i].DebugString(),
                                     ", got ", tensor->shape().DebugString());
    }
    if (!xla::LayoutUtil::IsMonotonicWithDim0Major(
            tuple_shape_.tuple_shapes(i).layout())) {
      // If the given layout is not in dim0major layout, transpose the given
      // tensor.
      TF_ASSIGN_OR_RETURN(
          Tensor transposed_tensor,
          TransposeTensor(ctx, *tensor, tuple_shape_.tuple_shapes(i)));
      maybe_transposed_tensors.emplace_back(transposed_tensor);
    } else {
      maybe_transposed_tensors.emplace_back(*tensor);
    }
  }

  xla::BorrowingLiteral tuple;
  TF_RETURN_IF_ERROR(
      HostTensorsToBorrowingLiteralTuple(maybe_transposed_tensors, &tuple));

  // Transfer the given literal to the Infeed interface of the device.
  TF_RETURN_IF_ERROR(
      transfer_manager->TransferLiteralToInfeed(stream_executor, tuple));

  VLOG(1) << "TransferLiteralToInfeed complete.";

  return Status::OK();
}

// These ops execute on either the TPU device or the CPU device. When running on
// CPU they must specify a non-negative value for device_ordinal to indicate
// which TPU to send infeed to.
REGISTER_KERNEL_BUILDER(
    Name("InfeedEnqueue").Device(DEVICE_TPU_NODE).HostMemory("input"),
    TpuInfeedEnqueueOp);
REGISTER_KERNEL_BUILDER(Name("InfeedEnqueue").Device(DEVICE_CPU),
                        TpuInfeedEnqueueOp);

REGISTER_KERNEL_BUILDER(
    Name("InfeedEnqueueTuple").Device(DEVICE_TPU_NODE).HostMemory("inputs"),
    TpuInfeedEnqueueTupleOp);
REGISTER_KERNEL_BUILDER(Name("InfeedEnqueueTuple").Device(DEVICE_CPU),
                        TpuInfeedEnqueueTupleOp);

// Prelinearize ops run on CPU as part of the tf.data input pipeline.
REGISTER_KERNEL_BUILDER(Name("Prelinearize").Device(DEVICE_CPU),
                        PrelinearizeOp);
REGISTER_KERNEL_BUILDER(Name("PrelinearizeTuple").Device(DEVICE_CPU),
                        PrelinearizeTupleOp);

// The InfeedEnqueuePrelinearizedBuffer op runs on CPU and takes a
// device_ordinal to select the right device to infeed.
REGISTER_KERNEL_BUILDER(
    Name("InfeedEnqueuePrelinearizedBuffer").Device(DEVICE_CPU),
    InfeedEnqueuePrelinearizedBufferOp);

}  // namespace tensorflow
tensorflow/core/tpu/kernels/infeed_ops.h (new file, 69 lines)
@@ -0,0 +1,69 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_TPU_KERNELS_INFEED_OPS_H_
#define TENSORFLOW_CORE_TPU_KERNELS_INFEED_OPS_H_

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/platform/status.h"
#include "tensorflow/core/tpu/kernels/transfer_ops.h"

namespace tensorflow {

// TODO(b/65200690): Rework this when there is a callback based infeed API to
// StreamExecutor.

// The InfeedEnqueue op is used to deliver data to the device infeed queue.
class TpuInfeedEnqueueOp : public TpuTransferAsyncOpKernel {
 public:
  explicit TpuInfeedEnqueueOp(OpKernelConstruction* ctx);
  Status DoWork(OpKernelContext* ctx,
                xla::TpuTransferManagerInterface* transfer_manager,
                stream_executor::StreamExecutor* stream_executor) override;

 private:
  TensorShape shape_;
  DataType dtype_;
  xla::Shape xla_shape_;

  // TpuInfeedEnqueueOp is neither copyable nor movable.
  TpuInfeedEnqueueOp(const TpuInfeedEnqueueOp&) = delete;
  TpuInfeedEnqueueOp& operator=(const TpuInfeedEnqueueOp&) = delete;
};

// The InfeedEnqueueTuple op is used on the host to deliver multiple tensors to
// the device infeed queue as an XLA tuple.
class TpuInfeedEnqueueTupleOp : public TpuTransferAsyncOpKernel {
 public:
  explicit TpuInfeedEnqueueTupleOp(OpKernelConstruction* ctx);
  Status DoWork(OpKernelContext* ctx,
                xla::TpuTransferManagerInterface* transfer_manager,
                stream_executor::StreamExecutor* stream_executor) override;

 private:
  std::vector<TensorShape> shapes_;
  DataTypeVector dtypes_;
  xla::Shape tuple_shape_;

  // TpuInfeedEnqueueTupleOp is neither copyable nor movable.
  TpuInfeedEnqueueTupleOp(const TpuInfeedEnqueueTupleOp&) = delete;
  TpuInfeedEnqueueTupleOp& operator=(const TpuInfeedEnqueueTupleOp&) = delete;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_TPU_KERNELS_INFEED_OPS_H_
tensorflow/core/tpu/kernels/outfeed_ops.cc (new file, 116 lines)
@@ -0,0 +1,116 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/tpu/kernels/outfeed_ops.h"

#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/tf2xla/literal_util.h"
#include "tensorflow/compiler/tf2xla/shape_util.h"
#include "tensorflow/compiler/tf2xla/type_util.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/tpu/kernels/transfer_ops.h"
#include "tensorflow/core/tpu/tpu_defs.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"

namespace tensorflow {

TpuOutfeedDequeueOp::TpuOutfeedDequeueOp(OpKernelConstruction* ctx)
    : TpuTransferAsyncOpKernel(ctx, "outfeed_dequeue", 1) {
  OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &shape_));
  OP_REQUIRES_OK(ctx, ctx->GetAttr("dtype", &dtype_));
  OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype_, shape_, &xla_shape_));
}

Status TpuOutfeedDequeueOp::DoWork(
    OpKernelContext* ctx, xla::TpuTransferManagerInterface* transfer_manager,
    stream_executor::StreamExecutor* stream_executor) {
  Tensor* output;
  TF_RETURN_IF_ERROR(ctx->allocate_output(0, shape_, &output));

  // Transfer from the outfeed interface of the device.
  xla::MutableBorrowingLiteral literal;
  TF_RETURN_IF_ERROR(
      HostTensorToMutableBorrowingLiteral(xla_shape_, output, &literal));

  VLOG(1) << "TransferLiteralFromOutfeed "
          << xla::ShapeUtil::HumanStringWithLayout(xla_shape_);

  TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralFromOutfeed(
      stream_executor, xla_shape_, literal));

  VLOG(1) << "TransferLiteralFromOutfeed complete.";

  return Status::OK();
}

// The OutfeedDequeueTuple op is used to retrieve multiple tensors from the
// device outfeed queue.
TpuOutfeedDequeueTupleOp::TpuOutfeedDequeueTupleOp(OpKernelConstruction* ctx)
    : TpuTransferAsyncOpKernel(ctx, "outfeed_dequeue", 1) {
  OP_REQUIRES_OK(ctx, ctx->GetAttr("shapes", &shapes_));
  OP_REQUIRES_OK(ctx, ctx->GetAttr("dtypes", &dtypes_));
  OP_REQUIRES(
      ctx, shapes_.size() == dtypes_.size(),
      errors::InvalidArgument("shapes and dtypes must be the same length."));
  // The `dtypes` list is inferred from the supplied inputs, so it
  // is always the correct length.
  for (int i = 0; i < shapes_.size(); i++) {
    xla::Shape xla_shape;
    OP_REQUIRES_OK(ctx,
                   TensorShapeToXLAShape(dtypes_[i], shapes_[i], &xla_shape));
    xla_shapes_.push_back(xla_shape);
  }
  tuple_shape_ = xla::ShapeUtil::MakeTupleShape(xla_shapes_);
}

Status TpuOutfeedDequeueTupleOp::DoWork(
    OpKernelContext* ctx, xla::TpuTransferManagerInterface* transfer_manager,
    stream_executor::StreamExecutor* stream_executor) {
  VLOG(1) << "TransferLiteralFromOutfeed "
          << xla::ShapeUtil::HumanStringWithLayout(tuple_shape_);

  for (int i = 0; i < shapes_.size(); ++i) {
    Tensor* output;
    TF_RETURN_IF_ERROR(ctx->allocate_output(i, shapes_[i], &output));

    xla::MutableBorrowingLiteral literal;
    TF_RETURN_IF_ERROR(
        HostTensorToMutableBorrowingLiteral(xla_shapes_[i], output, &literal));
    TF_RETURN_IF_ERROR(transfer_manager->TransferLiteralFromOutfeed(
        stream_executor, xla_shapes_[i], literal));
  }
  return Status::OK();
}

// These ops execute on either the TPU device or the CPU device. When
// running on CPU they must specify a non-negative value for
// device_ordinal to indicate which TPU to receive outfeed from.
REGISTER_KERNEL_BUILDER(
    Name("OutfeedDequeue").Device(DEVICE_TPU_NODE).HostMemory("output"),
    TpuOutfeedDequeueOp);
REGISTER_KERNEL_BUILDER(Name("OutfeedDequeue").Device(DEVICE_CPU),
                        TpuOutfeedDequeueOp);

REGISTER_KERNEL_BUILDER(
    Name("OutfeedDequeueTuple").Device(DEVICE_TPU_NODE).HostMemory("outputs"),
    TpuOutfeedDequeueTupleOp);
REGISTER_KERNEL_BUILDER(Name("OutfeedDequeueTuple").Device(DEVICE_CPU),
                        TpuOutfeedDequeueTupleOp);

}  // namespace tensorflow
tensorflow/core/tpu/kernels/outfeed_ops.h (new file, 69 lines)
@@ -0,0 +1,69 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_TPU_KERNELS_OUTFEED_OPS_H_
#define TENSORFLOW_CORE_TPU_KERNELS_OUTFEED_OPS_H_

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/tpu/kernels/transfer_ops.h"

namespace tensorflow {

// The OutfeedDequeue op is used to retrieve a single tensor from the device
// outfeed queue.
class TpuOutfeedDequeueOp : public TpuTransferAsyncOpKernel {
 public:
  explicit TpuOutfeedDequeueOp(OpKernelConstruction* ctx);

  Status DoWork(OpKernelContext* ctx,
                xla::TpuTransferManagerInterface* transfer_manager,
                stream_executor::StreamExecutor* stream_executor) override;

 private:
  TensorShape shape_;
  DataType dtype_;
  xla::Shape xla_shape_;

  // OutfeedDequeueOp is neither copyable nor movable.
  TpuOutfeedDequeueOp(const TpuOutfeedDequeueOp&) = delete;
  TpuOutfeedDequeueOp& operator=(const TpuOutfeedDequeueOp&) = delete;
};

// The OutfeedDequeueTuple op is used to retrieve multiple tensors from the
// device outfeed queue.
class TpuOutfeedDequeueTupleOp : public TpuTransferAsyncOpKernel {
 public:
  explicit TpuOutfeedDequeueTupleOp(OpKernelConstruction* ctx);

  Status DoWork(OpKernelContext* ctx,
                xla::TpuTransferManagerInterface* transfer_manager,
                stream_executor::StreamExecutor* stream_executor) override;

 private:
  std::vector<TensorShape> shapes_;
  DataTypeVector dtypes_;
  std::vector<xla::Shape> xla_shapes_;
  xla::Shape tuple_shape_;

  // OutfeedDequeueTupleOp is neither copyable nor movable.
  TpuOutfeedDequeueTupleOp(const TpuOutfeedDequeueTupleOp&) = delete;
  TpuOutfeedDequeueTupleOp& operator=(const TpuOutfeedDequeueTupleOp&) = delete;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_TPU_KERNELS_OUTFEED_OPS_H_
tensorflow/core/tpu/kernels/replication_ops.cc (new file, 27 lines)
@@ -0,0 +1,27 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/jit/xla_device_ops.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/tpu/tpu_defs.h"

namespace tensorflow {

REGISTER_KERNEL_BUILDER(Name("_TPUReplicate").Device(DEVICE_TPU_SYSTEM),
                        XlaDeviceDummyOp);

}  // namespace tensorflow
tensorflow/core/tpu/kernels/tpu_handle_to_key_op.cc (new file, 62 lines)
@@ -0,0 +1,62 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <string>
#include <vector>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/tpu/kernels/tpu_compilation_cache_interface.h"
#include "tensorflow/core/tpu/kernels/tpu_op_consts.h"
#include "tensorflow/core/tpu/tpu_configuration.h"

namespace tensorflow {

class TpuHandleToProtoKeyOp : public OpKernel {
 public:
  explicit TpuHandleToProtoKeyOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
  ~TpuHandleToProtoKeyOp() override = default;
  TpuHandleToProtoKeyOp(const TpuHandleToProtoKeyOp&) = delete;
  TpuHandleToProtoKeyOp& operator=(const TpuHandleToProtoKeyOp&) = delete;

  void Compute(OpKernelContext* ctx) override {
    VLOG(1) << "TpuHandleToProtoKeyOp::Compute " << ctx->op_kernel().name()
            << " on device " << ctx->op_kernel().requested_device();
    const Tensor& uid = ctx->input(0);

    ResourceMgr* rm = GetTPUConfigResourceMgr();
    tpu::TpuCompilationCacheInterface* cache;
    OP_REQUIRES_OK(ctx, rm->Lookup<tpu::TpuCompilationCacheInterface>(
                            rm->default_container(),
                            tpu::kCompilationCacheResourceName, &cache));
    core::ScopedUnref cache_unref(cache);

    std::vector<std::string> keys;
    OP_REQUIRES_OK(ctx, cache->GetKeysFromUid(uid.scalar<int64>()(), &keys));

    TensorShape output_shape;
    output_shape.AddDim(keys.size());
    Tensor* result = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &result));
    for (int i = 0; i < keys.size(); ++i) {
      result->vec<tstring>()(i) = keys[i];
    }
  }
};

REGISTER_KERNEL_BUILDER(Name("TpuHandleToProtoKey").Device(DEVICE_CPU),
                        TpuHandleToProtoKeyOp);

}  // namespace tensorflow
tensorflow/core/tpu/kernels/transfer_ops.cc (new file, 98 lines)
@@ -0,0 +1,98 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/tpu/kernels/transfer_ops.h"

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/tpu/tpu_node_context.h"
#include "tensorflow/stream_executor/tpu/tpu_platform_interface.h"
#include "tensorflow/stream_executor/tpu/tpu_transfer_manager_interface.h"

namespace tensorflow {

TpuTransferAsyncOpKernel::TpuTransferAsyncOpKernel(OpKernelConstruction* ctx,
                                                   const string& transfer_type,
                                                   int number_of_threads)
    : AsyncOpKernel(ctx),
      thread_pool_(new thread::ThreadPool(
          ctx->env(),
          strings::StrCat(transfer_type, "_thread_",
                          SanitizeThreadSuffix(def().name())),
          /*num_threads=*/8)) {
  OP_REQUIRES_OK(ctx, ctx->GetAttr("device_ordinal", &device_ordinal_));
  if (ctx->device_type() == DeviceType(DEVICE_CPU)) {
    OP_REQUIRES(
        ctx, device_ordinal_ >= 0,
        errors::InvalidArgument(transfer_type,
                                " ops must specify a device_ordinal when "
                                "placed on CPU."));
  }
}

void TpuTransferAsyncOpKernel::ComputeAsync(OpKernelContext* ctx,
                                            DoneCallback done) {
  CancellationToken token =
      ctx->cancellation_manager()->get_cancellation_token();
  bool already_cancelled;
  {
    // Only protect registering the cancellation callback as mu_ cannot be held
    // at a point where `done` could be called.
    mutex_lock lock(mu_);
    already_cancelled = !ctx->cancellation_manager()->RegisterCallback(
        token, [this]() { Cancel(); });
  }
  OP_REQUIRES_ASYNC(ctx, !already_cancelled,
                    errors::Cancelled("Infeed was cancelled."), done);
  thread_pool_->Schedule([this, ctx, done, token]() {
    Status s = RunTransfer(ctx);
    ctx->cancellation_manager()->DeregisterCallback(token);
    OP_REQUIRES_OK_ASYNC(ctx, s, done);
    done();
  });
}

Status TpuTransferAsyncOpKernel::RunTransfer(OpKernelContext* ctx) {
  auto* tpu_platform = tpu::TpuPlatformInterface::GetRegisteredPlatform();

  int real_device_ordinal = device_ordinal_;
  if (real_device_ordinal < 0) {
    const XlaDevice::Metadata* metadata;
    TF_RETURN_IF_ERROR(XlaDevice::GetMetadata(ctx, &metadata));
    real_device_ordinal = metadata->device_ordinal();
  }
  stream_executor::StreamExecutor* stream_executor =
      tpu_platform->ExecutorForDevice(real_device_ordinal).ValueOrDie();

  // When Xprof profiling is off (which is the default), constructing the
  // activity is simple enough that its overhead is negligible.
  profiler::TraceMe activity(
      [this] { return profiler::TraceMeOp(name(), type_string()); },
      profiler::TraceMeLevel::kInfo);
  return DoWork(
      ctx, xla::TpuTransferManagerInterface::GetRegisteredTpuTransferManager(),
      stream_executor);
}

void TpuTransferAsyncOpKernel::Cancel() {
  mutex_lock lock(mu_);
  TF_CHECK_OK(tpu::TpuNodeContext::CloseTpuHost());
}

}  // namespace tensorflow
tensorflow/core/tpu/kernels/transfer_ops.h (new file, 56 lines)
@@ -0,0 +1,56 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_TPU_KERNELS_TRANSFER_OPS_H_
#define TENSORFLOW_CORE_TPU_KERNELS_TRANSFER_OPS_H_

#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/util/stream_executor_util.h"
#include "tensorflow/stream_executor/tpu/tpu_transfer_manager_interface.h"

namespace tensorflow {

// Base class providing common functionality for async ops that transfer from
// host to TPU.
class TpuTransferAsyncOpKernel : public AsyncOpKernel {
 public:
  explicit TpuTransferAsyncOpKernel(OpKernelConstruction* ctx,
                                    const string& transfer_type,
                                    int number_of_threads);

  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override;

 protected:
  virtual Status DoWork(OpKernelContext* context,
                        xla::TpuTransferManagerInterface* transfer_manager,
                        stream_executor::StreamExecutor* stream_executor) = 0;

 private:
  Status RunTransfer(OpKernelContext* ctx);
  void Cancel();

  std::unique_ptr<thread::ThreadPool> thread_pool_;
  int device_ordinal_;
  mutex mu_;

  // TpuTransferAsyncOpKernel is neither copyable nor movable.
  TpuTransferAsyncOpKernel(const TpuTransferAsyncOpKernel&) = delete;
  TpuTransferAsyncOpKernel& operator=(const TpuTransferAsyncOpKernel&) = delete;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_TPU_KERNELS_TRANSFER_OPS_H_
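To make the intended use of this base class concrete, here is a minimal sketch of a derived transfer kernel, assuming the header above. The class name, op name, and the body of DoWork are hypothetical; the real infeed and outfeed kernels added by this commit perform their actual work against the transfer manager inside DoWork.

#include "tensorflow/core/tpu/kernels/transfer_ops.h"

namespace tensorflow {

// Hypothetical example; not part of this commit.
class ExampleEnqueueOp : public TpuTransferAsyncOpKernel {
 public:
  explicit ExampleEnqueueOp(OpKernelConstruction* ctx)
      : TpuTransferAsyncOpKernel(ctx, /*transfer_type=*/"example_enqueue",
                                 /*number_of_threads=*/8) {}

 protected:
  Status DoWork(OpKernelContext* ctx,
                xla::TpuTransferManagerInterface* transfer_manager,
                stream_executor::StreamExecutor* stream_executor) override {
    // A real kernel would linearize its inputs here and hand the buffers to
    // `transfer_manager` (e.g. TransferBuffersToInfeed) on `stream_executor`.
    return Status::OK();
  }
};

// Registration would follow the usual pattern, e.g.:
// REGISTER_KERNEL_BUILDER(Name("ExampleEnqueue").Device(DEVICE_TPU_NODE),
//                         ExampleEnqueueOp);

}  // namespace tensorflow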
tensorflow/core/tpu/tpu_defs.cc
@@ -15,6 +15,10 @@ limitations under the License.
#include "tensorflow/core/tpu/tpu_defs.h"

#include "tensorflow/core/tpu/tpu_api.h"
#include "tensorflow/stream_executor/tpu/c_api_conversions.h"
#include "tensorflow/stream_executor/tpu/c_api_decl.h"

namespace tensorflow {

const char* const DEVICE_TPU_NODE = "TPU";
@@ -27,4 +31,18 @@ const char* const TPUREPLICATE_MIRRORED_VAR_INDICES_ATTR =
const char* const kTPUReplicateAttr = "_tpu_replicate";
const char* const kOutsideCompilationAttr = "_xla_outside_compilation";

xla::Shape GetTPUInfeedLayout(const xla::Shape& shape) {
  XLA_Shape c_shape;
  XLA_Shape c_infeed_shape;

  ApiConverter::ToC(shape, &c_shape);

  tpu::ExecutorApiFn()->TpuTransferManager_GetInfeedLayoutFn(&c_shape,
                                                             &c_infeed_shape);
  xla::Shape infeed_shape = ApiConverter::FromC(&c_infeed_shape);
  ApiConverter::Free(&c_shape);
  ApiConverter::Free(&c_infeed_shape);
  return infeed_shape;
}

}  // namespace tensorflow
tensorflow/core/tpu/tpu_defs.h
@@ -20,6 +20,7 @@ limitations under the License.
#include <array>

#include "tensorflow/compiler/xla/shape.h"
#include "tensorflow/core/framework/types.pb.h"

namespace tensorflow {
@@ -56,6 +57,11 @@ static constexpr std::array<DataType, 16> kTpuAllTypes = {
    DT_COMPLEX64, DT_INT64, DT_UINT64, DT_QINT8, DT_QUINT8, DT_INT8, DT_UINT8,
    DT_INT16, DT_UINT16}};

// For the given shape, chooses a layout for infeed on TPU. The returned shape
// has the same dimensions as the original shape, and only the layout is
// changed.
xla::Shape GetTPUInfeedLayout(const xla::Shape& shape);

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_TPU_TPU_DEFS_H_
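A short usage sketch of the new GetTPUInfeedLayout helper follows; the wrapper function and the example dimensions are assumptions made only for illustration.

#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/tpu/tpu_defs.h"

// Hypothetical helper; picks the TPU infeed layout for an example host shape.
xla::Shape ExampleInfeedShape() {
  xla::Shape host_shape =
      xla::ShapeUtil::MakeShape(xla::F32, {128, 224, 224, 3});
  // The result has the same dimensions as host_shape; only its layout differs,
  // so it can drive any transpose/linearization done before the infeed enqueue.
  return tensorflow::GetTPUInfeedLayout(host_shape);
}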
@@ -161,6 +161,7 @@ tensorflow::Status SetExecutorStructFn(void* library_handle) {
  TFTPU_SET_FN(executor_fn, TpuTransferManager_TransferLiteralFromDevice);
  TFTPU_SET_FN(executor_fn, TpuTransferManager_GetByteSizeRequirement);
  TFTPU_SET_FN(executor_fn, TpuTransferManager_WriteSingleTupleIndexTable);
  TFTPU_SET_FN(executor_fn, TpuTransferManager_GetInfeedLayout);
  TFTPU_SET_FN(executor_fn, TpuTransferManager_LinearizeToBuffers);
  TFTPU_SET_FN(executor_fn, TpuTransferManager_FreeBuffers);
tensorflow/stream_executor/tpu/BUILD
@@ -203,10 +203,12 @@ cc_library(

cc_library(
    name = "tpu_transfer_manager_interface",
    srcs = ["tpu_transfer_manager_interface.cc"],
    hdrs = ["tpu_transfer_manager_interface.h"],
    visibility = ["//visibility:public"],
    deps = [
        ":noncopyable_buffer",
        ":tpu_platform_interface",
        "//tensorflow/compiler/xla/service:transfer_manager",
    ],
)
@@ -182,6 +182,8 @@ void TpuTransferManager_WriteSingleTupleIndexTable(
    XLA_TransferManager* manager, SE_Stream* stream,
    SE_DeviceMemoryBase* elements, size_t elements_len, XLA_Shape* shape,
    SE_DeviceMemoryBase* region, SE_Status* status);
void TpuTransferManager_GetInfeedLayout(XLA_Shape* shape,
                                        XLA_Shape* infeed_shape);
void TpuTransferManager_LinearizeToBuffers(
    XLA_TransferManager* manager, XLA_Literal* c_literal, char*** buffers_array,
    int64_t** buffers_size, int64_t* buffers_array_size, SE_Status* status);
@@ -341,6 +343,7 @@ struct TfTpu_ExecutorApiFn {
  TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_TransferLiteralFromDevice);
  TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_GetByteSizeRequirement);
  TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_WriteSingleTupleIndexTable);
  TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_GetInfeedLayout);
  TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_LinearizeToBuffers);
  TFTPU_ADD_FN_IN_STRUCT(TpuTransferManager_FreeBuffers);
@@ -81,6 +81,12 @@ class TpuTransferManager : public xla::TpuTransferManagerInterface {
      const xla::Shape& shape,
      stream_executor::DeviceMemoryBase* region) override;

  Status LinearizeToBuffers(
      const xla::LiteralSlice& literal,
      std::deque<tensorflow::tpu::NoncopyableBuffer>* buffers) override {
    LOG(FATAL) << "Not yet implemented.";
  }

 private:
  XLA_TransferManager* manager_;
};
tensorflow/stream_executor/tpu/tpu_transfer_manager_interface.cc (new file, 40 lines)
@@ -0,0 +1,40 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/stream_executor/tpu/tpu_transfer_manager_interface.h"

#include "tensorflow/stream_executor/tpu/tpu_platform_interface.h"

namespace xla {

/*static*/ TpuTransferManagerInterface*
TpuTransferManagerInterface::GetRegisteredTpuTransferManager() {
  auto* platform = tensorflow::tpu::TpuPlatformInterface::GetRegisteredPlatform(
      /*initialize_platform=*/false);
  if (platform == nullptr) {
    LOG(ERROR) << "Unable to retrieve registered TPU platform.";
    return nullptr;
  }
  auto tm = xla::TransferManager::GetForPlatform(platform);
  if (!tm.ok()) {
    LOG(ERROR) << "Unable to retrieve TpuTransferManager. No TPU platform is "
                  "registered for platform "
               << platform->Name() << " and ID " << platform->id();
    return nullptr;
  }
  return static_cast<TpuTransferManagerInterface*>(tm.ValueOrDie());
}

}  // namespace xla
tensorflow/stream_executor/tpu/tpu_transfer_manager_interface.h
@@ -24,9 +24,16 @@ limitations under the License.
namespace xla {

class TpuTransferManagerInterface : public xla::TransferManager {
 public:
  virtual Status TransferBuffersToInfeed(
      se::StreamExecutor* executor,
      const std::deque<tensorflow::tpu::NoncopyableBuffer>& buffers) = 0;

  virtual Status LinearizeToBuffers(
      const LiteralSlice& literal,
      std::deque<tensorflow::tpu::NoncopyableBuffer>* buffers) = 0;

  static TpuTransferManagerInterface* GetRegisteredTpuTransferManager();
};

}  // namespace xla
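Finally, a hedged sketch of how client code is expected to reach this interface. The wrapper function and error message are assumptions; the two interface calls are the ones declared above and implemented in tpu_transfer_manager_interface.cc.

#include <deque>

#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/stream_executor/tpu/tpu_transfer_manager_interface.h"

// Hypothetical helper; not part of this commit.
tensorflow::Status LinearizeForInfeedExample(
    const xla::LiteralSlice& literal,
    std::deque<tensorflow::tpu::NoncopyableBuffer>* buffers) {
  xla::TpuTransferManagerInterface* transfer_manager =
      xla::TpuTransferManagerInterface::GetRegisteredTpuTransferManager();
  if (transfer_manager == nullptr) {
    // GetRegisteredTpuTransferManager returns nullptr when no TPU platform is
    // registered, so callers must check before use.
    return tensorflow::errors::Internal("No TPU platform registered.");
  }
  // On success, `buffers` would typically be handed to TransferBuffersToInfeed
  // on the stream executor for the target TPU device.
  return transfer_manager->LinearizeToBuffers(literal, buffers);
}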