[TF:XLA] Create Interpreter backend from the Executor backend.

- Move plugin/executor to xla/service/interpreter/
- Remove executor's TransferManager, and use GenericTransferManager instead.
- Renamings and minor fixes.

PiperOrigin-RevId: 169160056
Kay Zhu 2017-09-18 15:49:35 -07:00 committed by TensorFlower Gardener
parent de724b1ac4
commit 7de939bb74
26 changed files with 539 additions and 576 deletions
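For orientation before the per-file diffs: a condensed, editorial sketch of the registration chain this change sets up, stitched together from the platform, compiler, and transfer-manager registrations that appear in the files below. In the commit itself each registration lives in its own module initializer; the single `RegisterInterpreterBackend()` helper here is illustrative only.

```cpp
#include <memory>

#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/interpreter/compiler.h"
#include "tensorflow/compiler/xla/service/interpreter/platform.h"
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/compiler/xla/service/interpreter_transfer_manager.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/status_macros.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"

namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::interpreter;

static std::unique_ptr<xla::TransferManager> CreateInterpreterTransferManager() {
  return xla::MakeUnique<xla::InterpreterTransferManager>();
}

static bool RegisterInterpreterBackend() {
  // 1. The platform registers itself with StreamExecutor's MultiPlatformManager.
  std::unique_ptr<se::Platform> platform(new sep::InterpreterPlatform);
  SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));

  // 2. The compiler factory is keyed by the same platform id.
  xla::Compiler::RegisterCompilerFactory(sep::kInterpreterPlatformId, []() {
    return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
  });

  // 3. The transfer manager is a thin GenericTransferManager subclass bound to
  //    the same id, which is what lets the old custom executor TransferManager
  //    be removed.
  xla::TransferManager::RegisterTransferManager(
      sep::kInterpreterPlatformId, &CreateInterpreterTransferManager);
  return true;
}

static bool interpreter_backend_registered = RegisterInterpreterBackend();
```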

View File

@ -238,7 +238,6 @@ filegroup(
"//tensorflow/compiler/jit/kernels:all_files",
"//tensorflow/compiler/jit/legacy_flags:all_files",
"//tensorflow/compiler/jit/ops:all_files",
"//tensorflow/compiler/plugin/executor:all_files",
"//tensorflow/compiler/tests:all_files",
"//tensorflow/compiler/tf2xla:all_files",
"//tensorflow/compiler/tf2xla/cc:all_files",
@ -252,6 +251,7 @@ filegroup(
"//tensorflow/compiler/xla/service/cpu:all_files",
"//tensorflow/compiler/xla/service/gpu:all_files",
"//tensorflow/compiler/xla/service/gpu/llvm_gpu_backend:all_files",
"//tensorflow/compiler/xla/service/interpreter:all_files",
"//tensorflow/compiler/xla/service/llvm_ir:all_files",
"//tensorflow/compiler/xla/tests:all_files",
"//tensorflow/compiler/xla/tools:all_files",

View File

@ -17,7 +17,6 @@ package_group(
package(
default_visibility = [
":internal",
"//tensorflow/compiler/plugin/executor:__pkg__",
],
)
@ -33,7 +32,6 @@ cc_library(
deps = [
":xla_cpu_device",
":xla_cpu_jit",
"//tensorflow/compiler/plugin",
] + if_cuda_is_configured([
":xla_gpu_device",
":xla_gpu_jit",
@ -99,6 +97,17 @@ cc_library(
alwayslink = 1,
)
cc_library(
name = "xla_interpreter_device",
srcs = ["xla_interpreter_device.cc"],
deps = [
":xla_device",
"//tensorflow/compiler/jit/kernels:xla_device_launch_op",
"//tensorflow/compiler/tf2xla:xla_compiler",
],
alwayslink = True,
)
# Internal targets below this point.
cc_library(

View File

@ -2,7 +2,6 @@ licenses(["notice"]) # Apache 2.0
package(
default_visibility = [
"//tensorflow/compiler/plugin/executor:__pkg__",
"//tensorflow/compiler/tf2xla:internal",
],
)

View File

@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Registers the XLA_INTERPRETER device which exposes the XLA Interpreter.
#include "tensorflow/compiler/jit/kernels/xla_device_launch_op.h"
#include "tensorflow/compiler/jit/xla_device.h"
#include "tensorflow/compiler/jit/xla_device_ops.h"
@ -20,46 +22,47 @@ limitations under the License.
namespace tensorflow {
const char* const DEVICE_XLA_EXEC = "XLA_EXEC";
const char* const DEVICE_EXEC_XLA_JIT = "XLA_EXEC_JIT";
const char* const DEVICE_XLA_INTERPRETER = "XLA_INTERPRETER";
const char* const DEVICE_INTERPRETER_XLA_JIT = "XLA_INTERPRETER_JIT";
constexpr std::array<DataType, 5> kExecAllTypes = {
{DT_INT32, DT_FLOAT, DT_BOOL, DT_DOUBLE, DT_INT64}};
class XlaExaDeviceFactory : public DeviceFactory {
class XlaInterpreterDeviceFactory : public DeviceFactory {
public:
Status CreateDevices(const SessionOptions& options, const string& name_prefix,
std::vector<Device*>* devices) override;
};
Status XlaExaDeviceFactory::CreateDevices(const SessionOptions& options,
const string& name_prefix,
std::vector<Device*>* devices) {
static XlaDeviceOpRegistrations* registrations =
RegisterXlaDeviceKernels(DEVICE_XLA_EXEC, DEVICE_EXEC_XLA_JIT);
Status XlaInterpreterDeviceFactory::CreateDevices(
const SessionOptions& options, const string& name_prefix,
std::vector<Device*>* devices) {
static XlaDeviceOpRegistrations* registrations = RegisterXlaDeviceKernels(
DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT);
(void)registrations;
std::unique_ptr<XlaDevice> device;
TF_RETURN_IF_ERROR(XlaDevice::Create("Executor", DEVICE_XLA_EXEC, 0,
DEVICE_EXEC_XLA_JIT, options,
TF_RETURN_IF_ERROR(XlaDevice::Create("Interpreter", DEVICE_XLA_INTERPRETER, 0,
DEVICE_INTERPRETER_XLA_JIT, options,
name_prefix, &device));
devices->push_back(device.release());
return Status::OK();
}
// Set priority to be below the default priority (50), so that Executor is not
// selected as a high priority device over other default devices.
// See constructor comments for Registrar in
// Set priority to be below the default priority (50), so that Interpreter is
// not selected as a high priority device over other default devices. See
// constructor comments for Registrar in
// tensorflow/core/common_runtime/device_factory.h for a list of priority for
// devices.
REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_EXEC, XlaExaDeviceFactory, 40);
REGISTER_LOCAL_DEVICE_FACTORY(DEVICE_XLA_INTERPRETER,
XlaInterpreterDeviceFactory, 40);
// Kernel registrations
static bool OpFilter(KernelDef* kdef) { return true; }
REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_EXEC, XlaDeviceLaunchOp, kExecAllTypes);
REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_EXEC, kExecAllTypes);
REGISTER_XLA_BACKEND(DEVICE_EXEC_XLA_JIT, kExecAllTypes, OpFilter);
REGISTER_XLA_LAUNCH_KERNEL(DEVICE_XLA_INTERPRETER, XlaDeviceLaunchOp,
kExecAllTypes);
REGISTER_XLA_DEVICE_KERNELS(DEVICE_XLA_INTERPRETER, kExecAllTypes);
REGISTER_XLA_BACKEND(DEVICE_INTERPRETER_XLA_JIT, kExecAllTypes, OpFilter);
} // namespace tensorflow
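A hypothetical usage sketch, not part of this commit: a binary that links in the `:xla_interpreter_device` target above should be able to place ops on the new device through ordinary TensorFlow device placement. The `/device:XLA_INTERPRETER:0` string and the tiny graph below are assumptions for illustration, derived from the `DEVICE_XLA_INTERPRETER` constant registered in this file.

```cpp
#include <vector>

#include "tensorflow/cc/client/client_session.h"
#include "tensorflow/cc/ops/standard_ops.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

int main() {
  using namespace tensorflow;  // NOLINT
  Scope root = Scope::NewRootScope();
  // Pin the computation to the interpreter-backed XLA device (assumed name).
  Scope interp = root.WithDevice("/device:XLA_INTERPRETER:0");
  auto a = ops::Const(interp, {1.0f, 2.0f, 3.0f});
  auto b = ops::Add(interp, a, a);

  ClientSession session(root);
  std::vector<Tensor> outputs;
  TF_CHECK_OK(session.Run({b}, &outputs));
  LOG(INFO) << outputs[0].DebugString();
  return 0;
}
```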

View File

@ -1,38 +0,0 @@
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Configuration file for an XLA plugin.
- please don't check in changes to this file
- to prevent changes appearing in git status, use:
git update-index --assume-unchanged tensorflow/compiler/plugin/BUILD
To add additional devices to the XLA subsystem, add targets to the
dependency list in the 'plugin' target. For instance:
deps = ["//tensorflow/compiler/plugin/example:plugin_lib"],
"""
licenses(["notice"])
package(
default_visibility = ["//visibility:public"],
)
cc_library(
name = "plugin",
deps = [
"//tensorflow/compiler/plugin/executor:plugin_lib",
],
)

View File

@ -1,37 +0,0 @@
licenses(["restricted"])
package(default_visibility = ["//visibility:public"])
cc_library(
name = "plugin_lib",
srcs = glob([
"*.cc",
]),
hdrs = glob([
"*.h",
]),
deps = [
"//tensorflow/compiler/jit:xla_device",
"//tensorflow/compiler/jit:xla_jit_headers_lib",
"//tensorflow/compiler/tf2xla:xla_compiler",
"//tensorflow/compiler/xla:xla_headers_lib",
"//tensorflow/compiler/xla/service",
"//tensorflow/compiler/xla/service:computation_placer",
"//tensorflow/compiler/xla/service:layout_assignment",
"//third_party/eigen3",
"@local_config_cuda//cuda:cuda_headers",
"@protobuf_archive//:protobuf_headers",
],
alwayslink = 1,
)
filegroup(
name = "all_files",
srcs = glob(
["**/*"],
exclude = [
"**/METADATA",
"**/OWNERS",
],
),
)

View File

@ -1,186 +0,0 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/plugin/executor/transfer_manager.h"
#include "tensorflow/compiler/plugin/executor/platform_id.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include <string>
#include <utility>
#include <vector>
namespace sep = ::perftools::gputools::executorplugin;
namespace xla {
namespace executorplugin {
ExecutorTransferManager::ExecutorTransferManager() {}
se::Platform::Id ExecutorTransferManager::PlatformId() const {
return se::executorplugin::kExecutorPlatformId;
}
Status ExecutorTransferManager::TransferLiteralFromDevice(
se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
const Shape& device_shape, const Shape& literal_shape, Literal* literal) {
TF_RET_CHECK(ShapeUtil::Compatible(device_shape, literal_shape));
// Tuples are a special case and contain one or more shapes inside of them to
// an arbitrary nesting depth.
if (device_shape.element_type() == TUPLE) {
*literal->mutable_shape() = literal_shape;
TF_ASSIGN_OR_RETURN(
std::vector<se::DeviceMemoryBase> element_buffers,
ShallowCopyTupleFromDevice(executor, source, device_shape));
TF_RET_CHECK(element_buffers.size() ==
ShapeUtil::TupleElementCount(device_shape));
for (int64 i = 0; i < element_buffers.size(); ++i) {
const Shape& element_device_shape = device_shape.tuple_shapes(i);
const Shape& element_literal_shape = literal_shape.tuple_shapes(i);
Literal* element_literal = literal->add_tuple_literals();
// Recursively call TransferFromDevice to copy over the data in the
// element array.
TF_RETURN_IF_ERROR(TransferLiteralFromDevice(
executor, element_buffers[i], element_device_shape,
element_literal_shape, element_literal));
}
return Status::OK();
}
*literal->mutable_shape() = device_shape;
literal->Reserve(ShapeUtil::ElementsIn(device_shape));
TF_RETURN_IF_ERROR(TransferBufferFromDevice(
executor, source, ShapeUtil::ByteSizeOf(device_shape),
literal->MutableInternalData()));
if (!ShapeUtil::Equal(literal_shape, device_shape)) {
*literal = std::move(*literal->Relayout(literal_shape.layout()));
}
TF_RET_CHECK(ShapeUtil::Equal(literal_shape, literal->shape()));
return Status::OK();
}
StatusOr<std::vector<se::DeviceMemoryBase>>
ExecutorTransferManager::ShallowCopyTupleFromDevice(
se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
const Shape& shape) {
TF_RET_CHECK(ShapeUtil::IsTuple(shape));
std::vector<void*> element_pointers(ShapeUtil::TupleElementCount(shape),
nullptr);
int64 tuple_size = ShapeUtil::ByteSizeOf(shape, sizeof(void*));
auto copy_status = executor->SynchronousMemcpyD2H(source, tuple_size,
element_pointers.data());
if (!copy_status.ok()) {
return AddStatus(
Status(static_cast<tensorflow::error::Code>(copy_status.code()),
copy_status.error_message()),
"failed transfer of tuple buffer " + ShapeUtil::HumanString(shape));
}
// Create a DeviceMemoryBase from each void* pointer.
std::vector<se::DeviceMemoryBase> destination;
for (int i = 0; i < element_pointers.size(); ++i) {
if (element_pointers[i] == nullptr &&
!ShapeUtil::HasZeroElements(shape.tuple_shapes(i))) {
return FailedPrecondition("tuple contains nullptr at element %d", i);
}
int64 buffer_size =
ShapeUtil::ByteSizeOf(shape.tuple_shapes(i), sizeof(void*));
destination.emplace_back(element_pointers[i], buffer_size);
}
return std::move(destination);
}
Status ExecutorTransferManager::TransferLiteralToDevice(
se::StreamExecutor* executor, const Literal& literal,
se::DeviceMemoryBase* destination) {
const Shape& shape = literal.shape();
if (ShapeUtil::IsTuple(literal.shape())) {
std::vector<void*> tuple_elements_on_device;
for (const Literal& tuple_element : literal.tuple_literals()) {
se::DeviceMemoryBase allocation = executor->AllocateArray<uint8>(
GetByteSizeRequirement(tuple_element.shape()));
TF_RETURN_IF_ERROR(
TransferLiteralToDevice(executor, tuple_element, &allocation));
tuple_elements_on_device.push_back(allocation.opaque());
}
return TransferBufferToDevice(
executor, tuple_elements_on_device.size() * sizeof(void*),
tuple_elements_on_device.data(), destination);
}
return TransferBufferToDevice(executor, GetByteSizeRequirement(shape),
literal.InternalData(),
destination);
}
Status ExecutorTransferManager::TransferLiteralToInfeed(
se::StreamExecutor* executor, const Literal& literal) {
const Shape& shape = literal.shape();
VLOG(1) << "transferring literal shape to infeed: "
<< ShapeUtil::HumanString(shape);
return Status::OK();
}
Status ExecutorTransferManager::TransferBufferToInfeed(
se::StreamExecutor* executor, int64 size, const void* source) {
return Unimplemented("Transfer to Infeed");
}
Status ExecutorTransferManager::TransferLiteralFromOutfeed(
perftools::gputools::StreamExecutor* executor, const Shape& literal_shape,
Literal* literal) {
const Shape& shape = literal->shape();
VLOG(1) << "transferring literal shape from outfeed: "
<< ShapeUtil::HumanString(shape);
return Status::OK();
}
Status ExecutorTransferManager::ResetDevices(
tensorflow::gtl::ArraySlice<perftools::gputools::StreamExecutor*>
executors) {
return Unimplemented("Device reset not supported");
}
int64 ExecutorTransferManager::GetByteSizeRequirement(const Shape& shape) {
return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}
} // namespace executorplugin
} // namespace xla
static std::unique_ptr<xla::TransferManager> CreateExecutorTransferManager() {
return xla::MakeUnique<xla::executorplugin::ExecutorTransferManager>();
}
static bool InitModule() {
xla::TransferManager::RegisterTransferManager(sep::kExecutorPlatformId,
&CreateExecutorTransferManager);
return true;
}
static bool module_initialized = InitModule();

View File

@ -1,77 +0,0 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
#include <vector>
namespace se = ::perftools::gputools;
namespace xla {
namespace executorplugin {
class ExecutorTransferManager : public TransferManager {
public:
ExecutorTransferManager();
~ExecutorTransferManager() override {}
se::Platform::Id PlatformId() const override;
StatusOr<std::vector<se::DeviceMemoryBase>> ShallowCopyTupleFromDevice(
se::StreamExecutor* executor, const se::DeviceMemoryBase& source,
const Shape& shape) override;
Status TransferLiteralFromDevice(se::StreamExecutor* executor,
const se::DeviceMemoryBase& source,
const Shape& device_shape,
const Shape& literal_shape,
Literal* literal) override;
Status TransferLiteralToDevice(se::StreamExecutor* executor,
const Literal& literal,
se::DeviceMemoryBase* destination) override;
Status TransferLiteralToInfeed(se::StreamExecutor* executor,
const Literal& literal) override;
Status TransferBufferToInfeed(se::StreamExecutor* executor,
int64 size, const void* source) override;
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor,
const Shape& literal_shape,
Literal* literal) override;
Status ResetDevices(
tensorflow::gtl::ArraySlice<se::StreamExecutor*> executors) override;
int64 GetByteSizeRequirement(const Shape& shape) override;
private:
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorTransferManager);
};
} // namespace executorplugin
} // namespace xla
#endif // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_TRANSFER_MANAGER_H_

View File

@ -529,6 +529,17 @@ cc_library(
],
)
cc_library(
name = "interpreter_plugin",
deps = [
":interpreter_transfer_manager",
":service",
"//tensorflow/compiler/xla/service/interpreter:compiler",
"//tensorflow/compiler/xla/service/interpreter:platform",
"//tensorflow/core:stream_executor_no_cuda",
],
)
cc_library(
name = "shaped_buffer",
srcs = ["shaped_buffer.cc"],
@ -1152,6 +1163,7 @@ cc_library(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service/interpreter:platform_id",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
],
@ -1200,6 +1212,27 @@ cc_library(
alwayslink = True, # Contains per-platform transfer manager registration
)
cc_library(
name = "interpreter_transfer_manager",
srcs = ["interpreter_transfer_manager.cc"],
hdrs = ["interpreter_transfer_manager.h"],
deps = [
":generic_transfer_manager",
":transfer_manager",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service/interpreter:platform_id",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
],
alwayslink = True, # Contains per-platform transfer manager registration
)
cc_test(
name = "transfer_manager_test",
srcs = ["transfer_manager_test.cc"],

View File

@ -20,6 +20,7 @@ limitations under the License.
#include <vector>
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/statusor.h"
@ -36,19 +37,16 @@ namespace xla {
GenericTransferManager::GenericTransferManager(se::Platform::Id platform_id)
: platform_id_(platform_id) {
// We currently only support kHostPlatformId for CPU and kCudaPlatformId for
// GPU. Before supporting other platforms, we need to test this transfer
// manager on them.
// We currently only support kHostPlatformId for CPU, kCudaPlatformId for
// GPU and kInterpreterPlatformId for Interpreter. Before supporting other
// platforms, we need to test this transfer manager on them.
CHECK(platform_id_ == se::host::kHostPlatformId ||
platform_id_ == se::interpreter::kInterpreterPlatformId ||
platform_id_ == se::cuda::kCudaPlatformId);
}
se::Platform::Id GenericTransferManager::PlatformId() const {
if (platform_id_ == se::cuda::kCudaPlatformId ||
platform_id_ == se::host::kHostPlatformId) {
return platform_id_;
}
CHECK(false) << "GenericTransferManager::platform_id_ is invalid";
return platform_id_;
}
Status GenericTransferManager::TransferLiteralFromDevice(

View File

@ -75,7 +75,7 @@ class GenericTransferManager : public TransferManager {
private:
// The platform this transfer manager targets.
perftools::gputools::Platform::Id platform_id_;
const perftools::gputools::Platform::Id platform_id_;
TF_DISALLOW_COPY_AND_ASSIGN(GenericTransferManager);
};

View File

@ -0,0 +1,113 @@
licenses(["restricted"])
package(default_visibility = ["//visibility:public"])
cc_library(
name = "compiler",
srcs = ["compiler.cc"],
hdrs = ["compiler.h"],
deps = [
":executable",
":platform_id",
"//tensorflow/compiler/xla:status",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service:algebraic_simplifier",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/compiler/xla/service:computation_placer",
"//tensorflow/compiler/xla/service:executable",
"//tensorflow/compiler/xla/service:flatten_call_graph",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_constant_folding",
"//tensorflow/compiler/xla/service:hlo_cost_analysis",
"//tensorflow/compiler/xla/service:hlo_cse",
"//tensorflow/compiler/xla/service:hlo_dce",
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla/service:hlo_pass",
"//tensorflow/compiler/xla/service:hlo_pass_pipeline",
"//tensorflow/compiler/xla/service:hlo_subcomputation_unification",
"//tensorflow/compiler/xla/service:inliner",
"//tensorflow/compiler/xla/service:layout_assignment",
"//tensorflow/compiler/xla/service:reshape_mover",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor",
],
alwayslink = True, # Contains compiler registration
)
cc_library(
name = "platform_id",
srcs = ["platform_id.cc"],
hdrs = ["platform_id.h"],
deps = [
"//tensorflow/core:stream_executor_headers_lib",
"@nsync//:nsync_headers",
"@protobuf_archive//:protobuf_headers",
"@protobuf_archive//:protoc_lib",
],
)
cc_library(
name = "executable",
srcs = ["executable.cc"],
hdrs = ["executable.h"],
deps = [
":executor",
"//tensorflow/compiler/xla:literal_util",
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service:executable",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_cost_analysis",
"//tensorflow/compiler/xla/service:hlo_evaluator",
"//tensorflow/compiler/xla/service:hlo_execution_profile",
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
],
)
cc_library(
name = "platform",
srcs = ["platform.cc"],
hdrs = ["platform.h"],
deps = [
":executor",
":platform_id",
"//tensorflow/core:stream_executor_headers_lib",
],
alwayslink = True, # Registers itself with the MultiPlatformManager.
)
cc_library(
name = "executor",
srcs = ["executor.cc"],
hdrs = ["executor.h"],
deps = [
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_headers_lib",
"//tensorflow/core:stream_executor_no_cuda",
],
)
filegroup(
name = "all_files",
srcs = glob(
["**/*"],
exclude = [
"**/METADATA",
"**/OWNERS",
],
),
)

View File

@ -0,0 +1,19 @@
# XLA Interpreter Backend
The XLA Interpreter backend operates at the HLO level: it ingests an HloModule
and evaluates the HLO graph directly with HloEvaluator, without lowering it
further (to LLVM IR, for example) before execution, as other backends (CPU and
GPU, for example) do.
Its key components are:
* [`InterpreterCompiler`]: despite the inherited "compiler" naming, all
  `InterpreterCompiler` really does is the following:
  1. Runs certain HLO optimization passes on the given HLO graph.
  2. Generates an `InterpreterExecutable` from the optimized HLO graph.
  3. Registers itself in the global compiler factory registry.
* [`InterpreterExecutable`]: responsible for running the input HLO graph
  through the `HloEvaluator`, allocating the output buffer, and finally copying
  the evaluated Literal result over.
* [`HloEvaluator`]: traverses an HLO graph and evaluates each node in DFS
  order along the way.
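For orientation, a minimal sketch of what "evaluating the HLO graph directly" means in practice is shown below. It mirrors `InterpreterExecutable::ExecuteOnStream` from this commit; the `EvaluateEntryComputation` helper name is illustrative only.

```cpp
#include <memory>
#include <vector>

#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

// Given an optimized HloModule and argument Literals, HloEvaluator produces
// the result Literal directly on the host -- no code generation involved.
StatusOr<std::unique_ptr<Literal>> EvaluateEntryComputation(
    const HloModule& module, const std::vector<Literal*>& arg_literals) {
  HloEvaluator evaluator;
  return evaluator.Evaluate(*module.entry_computation(), arg_literals);
}

}  // namespace xla
```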

View File

@ -13,11 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <stdlib.h>
#include <fstream>
#include "tensorflow/compiler/xla/service/interpreter/compiler.h"
#include "tensorflow/compiler/plugin/executor/compiler.h"
#include "tensorflow/compiler/plugin/executor/executable.h"
#include <string>
#include <utility>
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
#include "tensorflow/compiler/xla/service/computation_placer.h"
#include "tensorflow/compiler/xla/service/flatten_call_graph.h"
@ -28,26 +29,27 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
#include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h"
#include "tensorflow/compiler/xla/service/inliner.h"
#include "tensorflow/compiler/xla/service/interpreter/executable.h"
#include "tensorflow/compiler/xla/service/layout_assignment.h"
#include "tensorflow/compiler/xla/service/reshape_mover.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/strcat.h"
#include "tensorflow/core/platform/types.h"
namespace xla {
namespace executorplugin {
namespace interpreter {
namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;
namespace sep = ::perftools::gputools::interpreter;
/*
* Run optimization passes on the module. The graph is transformed by
* each pass in the optimization pipeline. The service subdirectory
* Run optimization passes on the module. The graph is transformed by
* each pass in the optimization pipeline. The service subdirectory
* contains useful optimization passes.
*/
Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module) {
HloPassPipeline pipeline("Executor");
Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {
HloPassPipeline pipeline("Interpreter");
pipeline.AddPass<Inliner>();
pipeline.AddPass<HloSubcomputationUnification>();
pipeline.AddPass<HloCSE>(false);
@ -65,9 +67,8 @@ Status ExecutorCompiler::RunHloOptimization(HloModule* hlo_module) {
return pipeline.Run(hlo_module).status();
}
StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
std::unique_ptr<HloModule> hlo_module,
se::StreamExecutor* stream_exec) {
StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::Compile(
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec) {
TF_RET_CHECK(stream_exec != nullptr);
VLOG(1) << "Generate graph " << hlo_module->name();
@ -75,53 +76,54 @@ StatusOr<std::unique_ptr<Executable>> ExecutorCompiler::Compile(
TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
// Typically you would visit the HLO graph, building up a compiled equivalent
// In this case we are using an Hlo evaluator at execution time, so we don't
// In this case we are using an HloEvaluator at execution time, so we don't
// need to compile anything
// Create executable from only the Hlo module
std::unique_ptr<Executable> executable;
executable.reset(new ExecutorExecutable(std::move(hlo_module)));
// Create executable from only the Hlo module.
std::unique_ptr<Executable> executable =
xla::MakeUnique<InterpreterExecutable>(std::move(hlo_module));
return std::move(executable);
}
StatusOr<std::vector<std::unique_ptr<Executable>>> ExecutorCompiler::Compile(
std::vector<std::unique_ptr<HloModule>> hlo_modules,
std::vector<se::StreamExecutor*> stream_execs) {
StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
std::vector<std::unique_ptr<HloModule>> /*hlo_modules*/,
std::vector<se::StreamExecutor*> /*stream_execs*/) {
return tensorflow::errors::Unimplemented(
"Compilation of multiple HLO modules is not supported on Executor.");
"Compilation of multiple HLO modules is not supported on Interpreter.");
}
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
ExecutorCompiler::CompileAheadOfTime(
InterpreterCompiler::CompileAheadOfTime(
std::vector<std::unique_ptr<HloModule>> hlo_modules,
const AotCompilationOptions& aot_options) {
return tensorflow::errors::InvalidArgument(
"AOT compilation not supported on Executor");
"AOT compilation not supported on Interpreter");
}
se::Platform::Id ExecutorCompiler::PlatformId() const {
return sep::kExecutorPlatformId;
se::Platform::Id InterpreterCompiler::PlatformId() const {
return sep::kInterpreterPlatformId;
}
HloCostAnalysis::ShapeSizeFunction
ExecutorCompiler::ShapeSizeBytesFunction() const {
return ExecutorExecutable::ShapeSizeBytes;
HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction()
const {
return InterpreterExecutable::ShapeSizeBytes;
}
static std::unique_ptr<xla::ComputationPlacer> CreateComputationPlacer() {
return xla::MakeUnique<xla::ComputationPlacer>();
}
REGISTER_MODULE_INITIALIZER(executor_compiler, {
xla::Compiler::RegisterCompilerFactory(sep::kExecutorPlatformId, []() {
return xla::MakeUnique<xla::executorplugin::ExecutorCompiler>();
static bool InitModule() {
xla::Compiler::RegisterCompilerFactory(sep::kInterpreterPlatformId, []() {
return xla::MakeUnique<xla::interpreter::InterpreterCompiler>();
});
xla::ComputationPlacer::RegisterComputationPlacer(sep::kExecutorPlatformId,
xla::ComputationPlacer::RegisterComputationPlacer(sep::kInterpreterPlatformId,
&CreateComputationPlacer);
});
return true;
}
} // namespace executorplugin
static bool module_initialized = InitModule();
} // namespace interpreter
} // namespace xla

View File

@ -13,38 +13,47 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
#define TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_COMPILER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_COMPILER_H_
#include <memory>
#include <vector>
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/plugin/executor/platform_id.h"
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/stream_executor/stream_executor.h"
namespace xla {
namespace executorplugin {
namespace interpreter {
class ExecutorCompiler : public Compiler {
// Despite the inherited "compiler" naming, InterpreterCompiler does not
// perform any lowering as other backends do. It operates at the HLO level
// and is responsible for generating an InterpreterExecutable.
// Refer to interpreter/README.md for more.
class InterpreterCompiler : public Compiler {
public:
ExecutorCompiler() {}
~ExecutorCompiler() override {}
InterpreterCompiler() {}
~InterpreterCompiler() override {}
StatusOr<std::unique_ptr<Executable>> Compile(
std::unique_ptr<HloModule> hlo_module,
std::unique_ptr<HloModule> hlo_modules,
perftools::gputools::StreamExecutor* stream_exec) override;
StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::vector<std::unique_ptr<HloModule>> hlo_module,
std::vector<std::unique_ptr<HloModule>> hlo_modules,
std::vector<perftools::gputools::StreamExecutor*> stream_exec) override;
StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(
std::vector<std::unique_ptr<HloModule>> module,
const AotCompilationOptions& options) override;
CompileAheadOfTime(std::vector<std::unique_ptr<HloModule>> hlo_modules,
const AotCompilationOptions& aot_options) override;
HloCostAnalysis::ShapeSizeFunction ShapeSizeBytesFunction() const override;
@ -53,10 +62,10 @@ class ExecutorCompiler : public Compiler {
private:
Status RunHloOptimization(HloModule* hlo_module);
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorCompiler);
TF_DISALLOW_COPY_AND_ASSIGN(InterpreterCompiler);
};
} // namespace executorplugin
} // namespace interpreter
} // namespace xla
#endif // TENSORFLOW_COMPILER_EXECUTOR_COMPILER_H_
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_COMPILER_H_

View File

@ -13,25 +13,41 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/plugin/executor/executable.h"
#include "tensorflow/compiler/plugin/executor/executor.h"
#include "tensorflow/compiler/xla/service/interpreter/executable.h"
#include <algorithm>
#include <cstring>
#include <string>
#include <utility>
#include <vector>
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_evaluator.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/interpreter/executor.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
namespace xla {
namespace executorplugin {
namespace interpreter {
namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;
namespace sep = ::perftools::gputools::interpreter;
ExecutorExecutable::ExecutorExecutable(std::unique_ptr<HloModule> hlo_module)
InterpreterExecutable::InterpreterExecutable(
std::unique_ptr<HloModule> hlo_module)
: Executable(std::move(hlo_module)) {}
ExecutorExecutable::~ExecutorExecutable() {}
InterpreterExecutable::~InterpreterExecutable() {}
static se::DeviceMemoryBase AllocateSingleOutput(
sep::ExecutorExecutor* executor, const Literal& literal) {
sep::InterpreterExecutor* executor, const Literal& literal) {
int64 size(xla::ShapeUtil::ByteSizeOf(literal.shape()));
void* buf = executor->Allocate(size);
const void* src = literal.InternalData();
@ -40,7 +56,7 @@ static se::DeviceMemoryBase AllocateSingleOutput(
}
static se::DeviceMemoryBase AllocateOutputBuffer(
sep::ExecutorExecutor* executor, const Literal& literal) {
sep::InterpreterExecutor* executor, const Literal& literal) {
const Shape& shape = literal.shape();
if (shape.element_type() != xla::TUPLE) {
return AllocateSingleOutput(executor, literal);
@ -58,7 +74,7 @@ static se::DeviceMemoryBase AllocateOutputBuffer(
}
}
StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteOnStream(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments,
HloExecutionProfile* hlo_execution_profile) {
@ -82,7 +98,7 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
// Create the arguments as a vector of XLA literals
std::vector<std::unique_ptr<Literal>> arg_literals;
std::vector<Literal*> arg_literals_ptrs;
for (int64 p = 0; p < computation->num_parameters(); p++) {
for (int64 p = 0; p < computation->num_parameters(); ++p) {
// Create the input literal for the parameter
HloInstruction* param = computation->parameter_instruction(p);
arg_literals.emplace_back(Literal::CreateFromShape(param->shape()));
@ -94,18 +110,18 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
ShapeUtil::ByteSizeOf(param->shape()));
}
// Execute the graph using the evaluator
// Execute the graph using the HloEvaluator.
HloEvaluator evaluator;
TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> output,
evaluator.Evaluate(*computation, arg_literals_ptrs));
// Copy the result into the return buffer
perftools::gputools::StreamExecutor* executor(stream->parent());
sep::ExecutorExecutor* executorExecutor(
static_cast<sep::ExecutorExecutor*>(executor->implementation()));
sep::InterpreterExecutor* interpreter_executor(
static_cast<sep::InterpreterExecutor*>(executor->implementation()));
se::DeviceMemoryBase ret =
AllocateOutputBuffer(executorExecutor, *(output.get()));
AllocateOutputBuffer(interpreter_executor, *(output.get()));
uint64 end_micros = tensorflow::Env::Default()->NowMicros();
@ -118,32 +134,32 @@ StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteOnStream(
return ret;
}
StatusOr<std::unique_ptr<ShapedBuffer>> ExecutorExecutable::ExecuteOnStream(
StatusOr<std::unique_ptr<ShapedBuffer>> InterpreterExecutable::ExecuteOnStream(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
HloExecutionProfile* hlo_execution_profile) {
return tensorflow::errors::Unimplemented(
"ExecuteOnStream is not yet supported on Executor.");
"ExecuteOnStream is not yet supported on Interpreter.");
}
StatusOr<se::DeviceMemoryBase> ExecutorExecutable::ExecuteAsyncOnStream(
StatusOr<se::DeviceMemoryBase> InterpreterExecutable::ExecuteAsyncOnStream(
const ServiceExecutableRunOptions* run_options,
tensorflow::gtl::ArraySlice<se::DeviceMemoryBase> arguments) {
return tensorflow::errors::Unimplemented(
"ExecuteAsyncOnStream is not yet supported on Executor.");
"ExecuteAsyncOnStream is not yet supported on Interpreter.");
}
/*static*/ int64 ExecutorExecutable::ShapeSizeBytes(const Shape& shape) {
/*static*/ int64 InterpreterExecutable::ShapeSizeBytes(const Shape& shape) {
if (ShapeUtil::IsOpaque(shape)) {
return sizeof(void*);
}
return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
}
std::unique_ptr<HloCostAnalysis> ExecutorExecutable::CreateCostAnalysis()
std::unique_ptr<HloCostAnalysis> InterpreterExecutable::CreateCostAnalysis()
const {
return MakeUnique<HloCostAnalysis>(ShapeSizeBytes);
}
} // namespace executorplugin
} // namespace interpreter
} // namespace xla

View File

@ -13,29 +13,35 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTABLE_H_
#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
namespace xla {
namespace executorplugin {
namespace interpreter {
class ExecutorExecutable : public Executable {
// Responsible for running an HLO graph through the HloEvaluator and for
// allocating the output buffer. Refer to interpreter/README.md for more.
class InterpreterExecutable : public Executable {
public:
ExecutorExecutable(std::unique_ptr<HloModule> hlo_module);
~ExecutorExecutable() override;
InterpreterExecutable(std::unique_ptr<HloModule> hlo_module);
~InterpreterExecutable() override;
StatusOr<perftools::gputools::DeviceMemoryBase> ExecuteOnStream(
const ServiceExecutableRunOptions* run_options,
@ -58,10 +64,10 @@ class ExecutorExecutable : public Executable {
std::unique_ptr<HloCostAnalysis> CreateCostAnalysis() const override;
private:
TF_DISALLOW_COPY_AND_ASSIGN(ExecutorExecutable);
TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable);
};
} // namespace executorplugin
} // namespace interpreter
} // namespace xla
#endif // TENSORFLOW_COMPILER_EXECUTOR_DRIVER_EXECUTOR_EXECUTABLE_H_
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTABLE_H_

View File

@ -13,117 +13,110 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/plugin/executor/executor.h"
#include "tensorflow/compiler/xla/service/interpreter/executor.h"
#include <stdlib.h>
#include <string.h>
#include <cstring>
#include "tensorflow/compiler/plugin/executor/platform_id.h"
#include "tensorflow/compiler/xla/status_macros.h"
namespace perftools {
namespace gputools {
namespace executorplugin {
namespace interpreter {
host::HostStream *AsExecutorStream(Stream *stream) {
DCHECK(stream != nullptr);
return dynamic_cast<host::HostStream *>(stream->implementation());
}
ExecutorExecutor::ExecutorExecutor(const PluginConfig &plugin_config)
InterpreterExecutor::InterpreterExecutor(const PluginConfig &plugin_config)
: plugin_config_(plugin_config) {}
ExecutorExecutor::~ExecutorExecutor() {}
InterpreterExecutor::~InterpreterExecutor() {}
void *ExecutorExecutor::Allocate(uint64 size) { return new char[size]; }
void *InterpreterExecutor::Allocate(uint64 size) { return new char[size]; }
void *ExecutorExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
uint64 offset_bytes,
uint64 size_bytes) {
void *InterpreterExecutor::AllocateSubBuffer(DeviceMemoryBase *parent,
uint64 offset_bytes,
uint64 /*size_bytes*/) {
return parent + offset_bytes;
}
void ExecutorExecutor::Deallocate(DeviceMemoryBase *mem) {
void InterpreterExecutor::Deallocate(DeviceMemoryBase *mem) {
if (!mem->is_sub_buffer()) {
delete[] static_cast<char *>(mem->opaque());
}
}
bool ExecutorExecutor::Memcpy(Stream *stream, void *host_dst,
const DeviceMemoryBase &dev_src, uint64 size) {
bool InterpreterExecutor::Memcpy(Stream *stream, void *host_dst,
const DeviceMemoryBase &dev_src, uint64 size) {
AsExecutorStream(stream)->EnqueueTask([this, host_dst, dev_src, size]() {
port::Status ok = SynchronousMemcpy(host_dst, dev_src, size);
});
return true;
}
bool ExecutorExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
const void *host_src, uint64 size) {
bool InterpreterExecutor::Memcpy(Stream *stream, DeviceMemoryBase *dev_dst,
const void *host_src, uint64 size) {
AsExecutorStream(stream)->EnqueueTask([this, dev_dst, host_src, size]() {
port::Status ok = SynchronousMemcpy(dev_dst, host_src, size);
});
return true;
}
port::Status ExecutorExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
const void *host_src,
uint64 size) {
port::Status InterpreterExecutor::SynchronousMemcpy(DeviceMemoryBase *dev_dst,
const void *host_src,
uint64 size) {
memcpy(dev_dst->opaque(), host_src, size);
return port::Status::OK();
}
port::Status ExecutorExecutor::SynchronousMemcpy(void *host_dst,
const DeviceMemoryBase &dev_src,
uint64 size) {
port::Status InterpreterExecutor::SynchronousMemcpy(
void *host_dst, const DeviceMemoryBase &dev_src, uint64 size) {
memcpy(host_dst, dev_src.opaque(), size);
return port::Status::OK();
}
bool ExecutorExecutor::HostCallback(Stream *stream,
std::function<void()> callback) {
bool InterpreterExecutor::HostCallback(Stream *stream,
std::function<void()> callback) {
AsExecutorStream(stream)->EnqueueTask(callback);
return true;
}
bool ExecutorExecutor::CreateStreamDependency(Stream *dependent, Stream *other) {
bool InterpreterExecutor::CreateStreamDependency(Stream *dependent,
Stream *other) {
AsExecutorStream(dependent)->EnqueueTask(
[other]() { other->BlockHostUntilDone(); });
AsExecutorStream(dependent)->BlockUntilDone();
return true;
}
bool ExecutorExecutor::StartTimer(Stream *stream, Timer *timer) {
bool InterpreterExecutor::StartTimer(Stream *stream, Timer *timer) {
dynamic_cast<host::HostTimer *>(timer->implementation())->Start(stream);
return true;
}
bool ExecutorExecutor::StopTimer(Stream *stream, Timer *timer) {
bool InterpreterExecutor::StopTimer(Stream *stream, Timer *timer) {
dynamic_cast<host::HostTimer *>(timer->implementation())->Stop(stream);
return true;
}
bool ExecutorExecutor::BlockHostUntilDone(Stream *stream) {
bool InterpreterExecutor::BlockHostUntilDone(Stream *stream) {
AsExecutorStream(stream)->BlockUntilDone();
return true;
}
DeviceDescription *ExecutorExecutor::PopulateDeviceDescription() const {
DeviceDescription *InterpreterExecutor::PopulateDeviceDescription() const {
internal::DeviceDescriptionBuilder builder;
builder.set_device_address_bits(64);
builder.set_name("Executor");
builder.set_device_vendor("VectorName");
builder.set_platform_version("1.0");
builder.set_driver_version("1.0");
builder.set_runtime_version("1.0");
builder.set_pci_bus_id("1");
builder.set_name("Interpreter");
builder.set_device_memory_size(static_cast<uint64>(4) * 1024 * 1024 * 1024);
builder.set_clock_rate_ghz(static_cast<float>(CLOCKS_PER_SEC) / 1e9);
return builder.Build().release();
}
} // namespace executorplugin
} // namespace interpreter
} // namespace gputools
} // namespace perftools

View File

@ -13,38 +13,47 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
// Declares the ExecutorExecutor class, which is a CPU-only implementation of
// Declares the InterpreterExecutor class, which is a CPU-only implementation of
// the StreamExecutor interface. For now, this is used for testing and to
// examine the performance of host-based StreamExecutor code.
#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_
#include "tensorflow/stream_executor/host/host_stream.h"
#include "tensorflow/stream_executor/host/host_timer.h"
#include <functional>
#include <memory>
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/blas.h"
#include "tensorflow/stream_executor/lib/error.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/device_description.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/event.h"
#include "tensorflow/stream_executor/host/host_stream.h"
#include "tensorflow/stream_executor/host/host_timer.h"
#include "tensorflow/stream_executor/kernel.h"
#include "tensorflow/stream_executor/kernel_spec.h"
#include "tensorflow/stream_executor/launch_dim.h"
#include "tensorflow/stream_executor/plugin.h"
#include "tensorflow/stream_executor/rng.h"
#include "tensorflow/stream_executor/shared_memory_config.h"
#include "tensorflow/stream_executor/stream.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
#include <list>
#include <mutex>
#include "tensorflow/stream_executor/timer.h"
namespace perftools {
namespace gputools {
namespace executorplugin {
namespace interpreter {
using Args = tensorflow::gtl::ArraySlice<DeviceMemoryBase>;
class ExecutorExecutor : public internal::StreamExecutorInterface {
class InterpreterExecutor : public internal::StreamExecutorInterface {
public:
explicit ExecutorExecutor(const PluginConfig &plugin_config);
~ExecutorExecutor() override;
explicit InterpreterExecutor(const PluginConfig &plugin_config);
~InterpreterExecutor() override;
port::Status Init(int device_ordinal, DeviceOptions device_options) override {
return port::Status::OK();
@ -194,9 +203,6 @@ class ExecutorExecutor : public internal::StreamExecutorInterface {
return std::unique_ptr<internal::TimerInterface>(new host::HostTimer());
}
port::StatusOr<DeviceMemoryBase> ExecuteGraph(const xla::Shape &shape,
Args args);
private:
DeviceMemoryBase AllocateSingleOutput(const xla::Shape &shape);
@ -206,8 +212,8 @@ class ExecutorExecutor : public internal::StreamExecutorInterface {
const PluginConfig plugin_config_;
};
} // namespace executorplugin
} // namespace interpreter
} // namespace gputools
} // namespace perftools
#endif // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_EXECUTOR_H_
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_EXECUTOR_H_

View File

@ -13,37 +13,39 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/plugin/executor/platform.h"
#include "tensorflow/compiler/plugin/executor/executor.h"
#include "tensorflow/compiler/plugin/executor/platform_id.h"
#include "tensorflow/compiler/xla/service/interpreter/platform.h"
#include "tensorflow/stream_executor/lib/error.h"
#include <utility>
#include "tensorflow/compiler/xla/service/interpreter/executor.h"
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/stream_executor/device_options.h"
#include "tensorflow/stream_executor/lib/initialize.h"
#include "tensorflow/stream_executor/lib/ptr_util.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/status_macros.h"
#include "tensorflow/stream_executor/lib/stringprintf.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
namespace se = ::perftools::gputools;
namespace sep = ::perftools::gputools::executorplugin;
namespace sep = ::perftools::gputools::interpreter;
namespace perftools {
namespace gputools {
namespace executorplugin {
namespace interpreter {
PLATFORM_DEFINE_ID(kExecutorPlatformId);
InterpreterPlatform::InterpreterPlatform() : name_("Interpreter") {}
ExecutorPlatform::ExecutorPlatform() : name_("Executor") {}
InterpreterPlatform::~InterpreterPlatform() {}
ExecutorPlatform::~ExecutorPlatform() {}
Platform::Id InterpreterPlatform::id() const { return kInterpreterPlatformId; }
Platform::Id ExecutorPlatform::id() const { return kExecutorPlatformId; }
int InterpreterPlatform::VisibleDeviceCount() const { return 1; }
int ExecutorPlatform::VisibleDeviceCount() const { return 1; }
const string& InterpreterPlatform::Name() const { return name_; }
const string& ExecutorPlatform::Name() const { return name_; }
port::StatusOr<StreamExecutor*> ExecutorPlatform::ExecutorForDevice(
port::StatusOr<StreamExecutor*> InterpreterPlatform::ExecutorForDevice(
int ordinal) {
StreamExecutorConfig config;
config.ordinal = ordinal;
@ -53,7 +55,7 @@ port::StatusOr<StreamExecutor*> ExecutorPlatform::ExecutorForDevice(
}
port::StatusOr<StreamExecutor*>
ExecutorPlatform::ExecutorForDeviceWithPluginConfig(
InterpreterPlatform::ExecutorForDeviceWithPluginConfig(
int device_ordinal, const PluginConfig& plugin_config) {
StreamExecutorConfig config;
config.ordinal = device_ordinal;
@ -62,16 +64,16 @@ ExecutorPlatform::ExecutorForDeviceWithPluginConfig(
return GetExecutor(config);
}
port::StatusOr<StreamExecutor*> ExecutorPlatform::GetExecutor(
port::StatusOr<StreamExecutor*> InterpreterPlatform::GetExecutor(
const StreamExecutorConfig& config) {
return executor_cache_.GetOrCreate(
config, [&]() { return GetUncachedExecutor(config); });
}
port::StatusOr<std::unique_ptr<StreamExecutor>>
ExecutorPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
InterpreterPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
auto executor = port::MakeUnique<StreamExecutor>(
this, port::MakeUnique<ExecutorExecutor>(config.plugin_config));
this, port::MakeUnique<InterpreterExecutor>(config.plugin_config));
auto init_status = executor->Init(config.ordinal, config.device_options);
if (!init_status.ok()) {
return port::Status{
@ -84,27 +86,30 @@ ExecutorPlatform::GetUncachedExecutor(const StreamExecutorConfig& config) {
return std::move(executor);
}
void ExecutorPlatform::RegisterTraceListener(
void InterpreterPlatform::RegisterTraceListener(
std::unique_ptr<TraceListener> listener) {
LOG(FATAL) << "not yet implemented: register executor trace listener";
}
void ExecutorPlatform::UnregisterTraceListener(TraceListener* listener) {
void InterpreterPlatform::UnregisterTraceListener(TraceListener* listener) {
LOG(FATAL) << "not yet implemented: unregister executor trace listener";
}
static void InitializeExecutorPlatform() {
std::unique_ptr<se::Platform> platform(new sep::ExecutorPlatform);
static void InitializeInterpreterPlatform() {
std::unique_ptr<se::Platform> platform(new sep::InterpreterPlatform);
SE_CHECK_OK(se::MultiPlatformManager::RegisterPlatform(std::move(platform)));
}
} // namespace executorplugin
} // namespace interpreter
} // namespace gputools
} // namespace perftools
REGISTER_MODULE_INITIALIZER(executor_platform, sep::InitializeExecutorPlatform());
REGISTER_MODULE_INITIALIZER(interpreter_platform,
sep::InitializeInterpreterPlatform());
DECLARE_MODULE_INITIALIZER(multi_platform_manager);
// Note that module initialization sequencing is not supported in the
// open-source project, so this will be a no-op there.
REGISTER_MODULE_INITIALIZER_SEQUENCE(executor_platform, multi_platform_manager);
REGISTER_MODULE_INITIALIZER_SEQUENCE(interpreter_platform,
multi_platform_manager);

View File

@ -12,38 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
#define TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_
#include <memory>
#include <string>
#include <vector>
#include "tensorflow/stream_executor/executor_cache.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/mutex.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/platform/thread_annotations.h"
#include "tensorflow/stream_executor/stream_executor_pimpl.h"
#include "tensorflow/stream_executor/plugin.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/trace_listener.h"
namespace perftools {
namespace gputools {
namespace executorplugin {
namespace interpreter {
class ExecutorPlatform : public Platform {
class InterpreterPlatform : public Platform {
public:
ExecutorPlatform();
~ExecutorPlatform() override;
InterpreterPlatform();
~InterpreterPlatform() override;
Platform::Id id() const override;
// Device count is less clear-cut for CPUs than accelerators. This call
// currently returns the number of thread units in the host, as reported by
// base::NumCPUs().
int VisibleDeviceCount() const override;
const string& Name() const override;
@ -70,11 +60,11 @@ class ExecutorPlatform : public Platform {
// Cache of created StreamExecutors.
ExecutorCache executor_cache_;
SE_DISALLOW_COPY_AND_ASSIGN(ExecutorPlatform);
SE_DISALLOW_COPY_AND_ASSIGN(InterpreterPlatform);
};
} // namespace executorplugin
} // namespace interpreter
} // namespace gputools
} // namespace perftools
#endif // TENSORFLOW_COMPILER_EXECUTOR_STREAM_EXECUTOR_EXECUTOR_PLATFORM_H_
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_H_

View File

@ -0,0 +1,25 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
namespace perftools {
namespace gputools {
namespace interpreter {
PLATFORM_DEFINE_ID(kInterpreterPlatformId);
} // namespace interpreter
} // namespace gputools
} // namespace perftools

View File

@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
#define TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_
#include "tensorflow/stream_executor/platform.h"
namespace perftools {
namespace gputools {
namespace executorplugin {
namespace interpreter {
extern const Platform::Id kExecutorPlatformId;
extern const Platform::Id kInterpreterPlatformId;
} // namespace executorplugin
} // namespace interpreter
} // namespace gputools
} // namespace perftools
#endif // TENSORFLOW_STREAM_EXECUTOR_EXECUTOR_PLATFORM_ID_H_
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_PLATFORM_ID_H_

View File

@ -0,0 +1,44 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/compiler/xla/service/interpreter_transfer_manager.h"
#include <memory>
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/interpreter/platform_id.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
namespace sei = ::perftools::gputools::interpreter;
namespace xla {
InterpreterTransferManager::InterpreterTransferManager()
: GenericTransferManager(sei::kInterpreterPlatformId) {}
} // namespace xla
static std::unique_ptr<xla::TransferManager>
CreateInterpreterTransferManager() {
return xla::MakeUnique<xla::InterpreterTransferManager>();
}
static bool InitModule() {
xla::TransferManager::RegisterTransferManager(
sei::kInterpreterPlatformId, &CreateInterpreterTransferManager);
return true;
}
static bool module_initialized = InitModule();

View File

@ -0,0 +1,36 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_
#include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
#include "tensorflow/core/platform/macros.h"
namespace xla {
// An implementation of the XLA GenericTransferManager for interpreter backend.
class InterpreterTransferManager : public GenericTransferManager {
public:
InterpreterTransferManager();
~InterpreterTransferManager() override = default;
private:
TF_DISALLOW_COPY_AND_ASSIGN(InterpreterTransferManager);
};
} // namespace xla
#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INTERPRETER_TRANSFER_MANAGER_H_

View File

@ -107,15 +107,10 @@ cc_binary(
)
cc_binary(
name = "replay_computation_hlo_evaluator",
name = "replay_computation_interpreter",
deps = [
":replay_computation_library",
"//tensorflow/compiler/plugin/executor:plugin_lib",
# TODO: This dependency is a workaround for linking error with clang.
# Without it, linker complains about missing symbols from
# 'xla_device_launch_op'. This dependency should be propagated from
# plugin_lib instead, but no targets other than this break without it.
"//tensorflow/compiler/jit",
"//tensorflow/compiler/xla/service:interpreter_plugin",
],
)