Move DeviceMemoryAllocator and OwningDeviceMemory from XLA to StreamExecutor.
This change achieves three goals:

1. There are currently three different allocator abstractions in three different places: XLA, StreamExecutor, and TensorFlow. This change reduces the number of packages with an allocator abstraction to two.

2. Moving the allocator enables unifying ScopedDeviceMemory and OwningDeviceMemory, which both have "owning pointer" semantics but slightly different APIs.

3. Moving the allocator enables moving RedzoneAllocator into StreamExecutor, which we would like to use in TensorFlow to catch out-of-bounds writes in cuDNN convolutions during autotuning.

PiperOrigin-RevId: 247211996
parent 3ae5e7d3b9
commit 0410cff073
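For orientation, the abstraction being moved looks roughly like the sketch below. This is a simplified reconstruction from the signatures visible in this diff, not the verbatim header: the real class has more members, and StatusOr/Status are spelled unqualified here because the exact aliases used inside stream_executor at this revision are not shown in the diff (the overrides in tensorflow/compiler/jit below only compile because the types are identical).

namespace stream_executor {

// Abstract allocator for raw device memory, keyed by device ordinal.
class DeviceMemoryAllocator {
 public:
  // StatusOr/Status stand for stream_executor's status types (assumption).
  explicit DeviceMemoryAllocator(const Platform* platform)
      : platform_(platform) {}
  virtual ~DeviceMemoryAllocator() = default;

  // Allocates `size` bytes on the device with ordinal `device_ordinal`.
  // On success, the returned OwningDeviceMemory deallocates through this
  // allocator when it goes out of scope -- the "owning pointer" semantics
  // that goal 2 above wants to unify with ScopedDeviceMemory.
  virtual StatusOr<OwningDeviceMemory> Allocate(
      int device_ordinal, uint64 size, bool retry_on_failure) = 0;

  // Frees memory previously handed out by Allocate().
  virtual Status Deallocate(int device_ordinal, DeviceMemoryBase mem) = 0;

  const Platform* platform() const { return platform_; }

 private:
  const Platform* platform_;
};

}  // namespace stream_executor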
@@ -262,7 +262,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
@@ -270,6 +269,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
@@ -62,7 +62,7 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) {
   se::Platform::Id platform_id = nullptr;
   const XlaDevice::Metadata* xla_device_metadata = nullptr;
   std::unique_ptr<XlaAllocator> xla_allocator;
-  xla::DeviceMemoryAllocator* device_allocator = nullptr;
+  se::DeviceMemoryAllocator* device_allocator = nullptr;
 
   if (ctx->device_type() == DeviceType(DEVICE_CPU)) {
     platform_id = se::host::kHostPlatformId;
@@ -40,7 +40,7 @@ class XlaPlatformInfo {
                   se::Platform::Id platform_id,
                   const XlaDevice::Metadata* xla_device_metadata,
                   std::unique_ptr<XlaAllocator> xla_allocator,
-                  xla::DeviceMemoryAllocator* device_allocator)
+                  se::DeviceMemoryAllocator* device_allocator)
       : device_type_(device_type),
         platform_id_(platform_id),
         xla_device_metadata_(xla_device_metadata),
@@ -55,7 +55,7 @@ class XlaPlatformInfo {
     return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams();
   }
 
-  xla::DeviceMemoryAllocator* allocator() const {
+  se::DeviceMemoryAllocator* allocator() const {
     return device_allocator_ ? device_allocator_ : xla_allocator_.get();
   }
   DeviceType device_type() const { return device_type_; }
@@ -86,7 +86,7 @@ class XlaPlatformInfo {
   // then device_allocator_ is null and xla_allocator_ points to an appropriate
   // XlaAllocator instance.
   std::unique_ptr<XlaAllocator> xla_allocator_;
-  xla::DeviceMemoryAllocator* device_allocator_;
+  se::DeviceMemoryAllocator* device_allocator_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo);
 };
@@ -168,11 +168,11 @@ Status SnapshotResourceVariables(OpKernelContext* ctx,
 }
 
 XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
-    : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
+    : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
 
 XlaAllocator::~XlaAllocator() {}
 
-xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
+xla::StatusOr<se::OwningDeviceMemory> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   AllocationAttributes attrs;
   attrs.no_retry_on_failure = !retry_on_failure;
@@ -184,8 +184,8 @@ xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
           "Out of memory while trying to allocate ", size, " bytes.");
     }
   }
-  return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
-                                 device_ordinal, this);
+  return se::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
+                                device_ordinal, this);
 }
 
 Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
@@ -194,7 +194,7 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
 }
 
 XlaComputationLaunchContext::XlaComputationLaunchContext(
-    xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator,
+    xla::LocalClient* client, se::DeviceMemoryAllocator* xla_allocator,
     bool allocate_xla_tensors, bool use_multiple_streams)
     : client_(client),
       xla_allocator_(xla_allocator),
@@ -374,7 +374,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           ctx->expected_output_dtype(i), shape, buffer, allocator);
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+      output.set_buffer(se::OwningDeviceMemory(), {output_num});
       ctx->set_output(i, output_tensor);
     }
     ++output_num;
@@ -435,7 +435,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
       *variable_infos[i].var()->tensor() = output_tensor;
     } else {
       se::DeviceMemoryBase buffer = output.buffer({output_num});
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+      output.set_buffer(se::OwningDeviceMemory(), {output_num});
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
       *variable_infos[i].var()->tensor() = output_tensor;
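The hunks above capture the mechanical shape of the whole change: the TensorFlow-side adapter stops deriving from xla::DeviceMemoryAllocator and derives from se::DeviceMemoryAllocator instead, with its behavior untouched. Condensed into one skeleton (bodies paraphrased from this file; the zero-size special case and error handling are compressed):

class XlaAllocator : public se::DeviceMemoryAllocator {
 public:
  XlaAllocator(const se::Platform* platform, Allocator* wrapped)
      : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}

  xla::StatusOr<se::OwningDeviceMemory> Allocate(
      int device_ordinal, uint64 size, bool retry_on_failure) override {
    AllocationAttributes attrs;
    attrs.no_retry_on_failure = !retry_on_failure;
    void* data =
        wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs);
    if (data == nullptr && size != 0) {
      return errors::ResourceExhausted(
          "Out of memory while trying to allocate ", size, " bytes.");
    }
    // The returned value remembers `this`, so its destructor routes the
    // buffer back through Deallocate() below.
    return se::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
                                  device_ordinal, this);
  }

  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override {
    wrapped_->DeallocateRaw(mem.opaque());
    return Status::OK();
  }

 private:
  Allocator* wrapped_;  // The wrapped TensorFlow allocator; not owned.
};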
@@ -23,14 +23,14 @@ limitations under the License.
 #include "tensorflow/compiler/jit/xla_tensor.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
 #include "tensorflow/core/framework/resource_var.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace tensorflow {
 class XlaAllocator;
@@ -108,11 +108,11 @@ Status LockVariables(absl::Span<VariableInfo> variables)
 // Adapter class that wraps a Tensorflow allocator as an XLA allocator.
 // Assumes that the Tensorflow allocator permits asynchronous deallocation:
 // see comment on `AllowsAsynchronousDeallocation()`.
-class XlaAllocator : public xla::DeviceMemoryAllocator {
+class XlaAllocator : public se::DeviceMemoryAllocator {
  public:
   XlaAllocator(const se::Platform* platform, Allocator* wrapped);
   ~XlaAllocator() override;
-  xla::StatusOr<xla::OwningDeviceMemory> Allocate(
+  xla::StatusOr<se::OwningDeviceMemory> Allocate(
       int device_ordinal, uint64 size, bool retry_on_failure) override;
   Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
@@ -142,7 +142,7 @@ class XlaComputationLaunchContext {
   // because we track inter-stream dependencies through events inside XlaTensor
   // objects.
   XlaComputationLaunchContext(xla::LocalClient* client,
-                              xla::DeviceMemoryAllocator* xla_allocator,
+                              se::DeviceMemoryAllocator* xla_allocator,
                               bool allocate_xla_tensors,
                               bool use_multiple_streams);
@@ -186,7 +186,7 @@ class XlaComputationLaunchContext {
 
  private:
   xla::LocalClient* client_;
-  xla::DeviceMemoryAllocator* xla_allocator_;
+  se::DeviceMemoryAllocator* xla_allocator_;
   bool allocate_xla_tensors_;
   bool use_multiple_streams_;
   std::vector<std::unique_ptr<xla::ShapedBuffer>> arg_buffers_;
@@ -59,7 +59,7 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype,
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size =
         client->backend().transfer_manager()->GetByteSizeRequirement(subshape);
-    TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
+    TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                         client->backend().memory_allocator()->Allocate(
                             device_ordinal, size, /*retry_on_failure=*/false));
     // Move our buffer into shaped_buffer, which takes ownership of it.
@@ -339,7 +339,7 @@ class XlaCompiler {
     // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly
     // allocate most or all available memory on the device, leaving none for the
     // compiler to access, unless it can use TensorFlow's allocator.
-    xla::DeviceMemoryAllocator* device_allocator = nullptr;
+    se::DeviceMemoryAllocator* device_allocator = nullptr;
   };
 
   explicit XlaCompiler(Options options);
@@ -96,7 +96,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -117,7 +117,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -126,6 +125,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:source_map_util",
         "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
         "@llvm//:support",
@@ -165,11 +165,11 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compile_only_service",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:optional",
     ],
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
@@ -22,12 +22,12 @@ limitations under the License.
 namespace xla {
 
 ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   device_allocator_ = allocator;
   return *this;
 }
 
-DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
   return device_allocator_;
 }
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -57,11 +57,11 @@ class ExecutableBuildOptions {
   // want to run various algorithms on the device and pick the fastest one -- it
   // might allocate buffers for use by these algorithms using this allocator.
   //
-  // This does not need to be the same as the DeviceMemoryAllocator passed when
-  // running the executable.
+  // This does not need to be the same as the se::DeviceMemoryAllocator passed
+  // when running the executable.
   ExecutableBuildOptions& set_device_allocator(
-      DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* device_allocator() const;
+      se::DeviceMemoryAllocator* allocator);
+  se::DeviceMemoryAllocator* device_allocator() const;
 
   // Returns a string representation of the build options, suitable for
   // debugging.
@@ -77,7 +77,7 @@ class ExecutableBuildOptions {
   Shape result_layout_;
   bool result_layout_set_ = false;
   absl::optional<DebugOptions> debug_options_;
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
+  se::DeviceMemoryAllocator* device_allocator_ = nullptr;
   int num_replicas_ = 1;
 };
@@ -279,7 +279,7 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
 
 StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -32,6 +31,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -137,7 +137,7 @@ class LocalClient : public Client {
   // device is used.
   StatusOr<ScopedShapedBuffer> LiteralToShapedBuffer(
       const LiteralSlice& literal, int device_ordinal,
-      DeviceMemoryAllocator* allocator = nullptr);
+      se::DeviceMemoryAllocator* allocator = nullptr);
 
   // Transfer the BorrowingLiteral to the device with the given ordinal.
   StatusOr<TransferToServerResponse> TransferToLocalServer(
@@ -26,12 +26,13 @@ ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal(
 int ExecutableRunOptions::device_ordinal() const { return device_ordinal_; }
 
 ExecutableRunOptions& ExecutableRunOptions::set_allocator(
-    DeviceMemoryAllocator* allocator) {
+    stream_executor::DeviceMemoryAllocator* allocator) {
   allocator_ = allocator;
   return *this;
 }
 
-DeviceMemoryAllocator* ExecutableRunOptions::allocator() const {
+stream_executor::DeviceMemoryAllocator* ExecutableRunOptions::allocator()
+    const {
   return allocator_;
 }
@@ -23,6 +23,7 @@ limitations under the License.
 namespace stream_executor {
 class Stream;
 class Platform;
+class DeviceMemoryAllocator;
 }  // namespace stream_executor
 
 namespace Eigen {
@@ -31,7 +32,6 @@ struct ThreadPoolDevice;
 
 namespace xla {
 
-class DeviceMemoryAllocator;
 class DeviceAssignment;
 class ExecutionProfile;
@@ -39,8 +39,9 @@ class ExecutionProfile;
 class ExecutableRunOptions {
  public:
   // Specifies the allocator to use during execution.
-  ExecutableRunOptions& set_allocator(DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* allocator() const;
+  ExecutableRunOptions& set_allocator(
+      stream_executor::DeviceMemoryAllocator* allocator);
+  stream_executor::DeviceMemoryAllocator* allocator() const;
 
   // If set, this is the device to run the computation on. Valid device_ordinal
   // values are: 0 to # of devices - 1. These values are identical to the device
@@ -87,7 +88,7 @@ class ExecutableRunOptions {
   int rng_seed() const;
 
  private:
-  DeviceMemoryAllocator* allocator_ = nullptr;
+  stream_executor::DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
   const DeviceAssignment* device_assignment_ = nullptr;
   stream_executor::Stream* stream_ = nullptr;
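For callers of ExecutableRunOptions the API shape is unchanged; only the namespace of the pointer type moves. A typical (hypothetical) call site after this change:

// `allocator` points at any concrete se::DeviceMemoryAllocator -- for
// example the backend's StreamExecutorMemoryAllocator, or an adapter such
// as the XlaAllocator above.
xla::ExecutableRunOptions run_options;
run_options.set_allocator(allocator);  // now a se::DeviceMemoryAllocator*
run_options.set_device_ordinal(0);
// run_options is then passed to LocalExecutable::Run() as before.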
@@ -67,8 +67,8 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:optional",
         "@pybind11",
@@ -109,9 +109,9 @@ cc_library(
     hdrs = ["shared_device_buffer.h"],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
@@ -178,7 +178,7 @@ tf_pybind_extension(
         "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
         "//tensorflow/compiler/xla/client/lib:svd",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:name_uniquer",
@@ -212,7 +212,7 @@ StatusOr<pybind11::object> PyLocalClient::TransferFromOutfeed(
 static StatusOr<PyLocalBuffer> TransferHostToDeviceAsync(
     const PythonBufferTree& tree, int device_ordinal, PyLocalClient* client,
     const Device& device) {
-  DeviceMemoryAllocator* allocator =
+  se::DeviceMemoryAllocator* allocator =
       client->client()->backend().memory_allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
@@ -367,7 +367,7 @@ PyLocalBuffer::FromPythonValues(
     host_shapes.push_back(buffer.on_host_shape());
     device_buffers.push_back(buffer.device_buffer());
   }
-  DeviceMemoryAllocator* allocator =
+  se::DeviceMemoryAllocator* allocator =
       client->client()->backend().memory_allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/python/shared_device_buffer.h"
 
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -47,14 +47,14 @@ void BufferDefinitionEvent::WaitForEventOnStream(se::Stream* stream) {
 static std::shared_ptr<PySharedDeviceBuffer>
 BufferFromScopedShapedBufferIterator(
     const Shape& on_device_shape, int device_ordinal,
-    DeviceMemoryAllocator* allocator,
+    se::DeviceMemoryAllocator* allocator,
     ShapeTree<se::DeviceMemoryBase>::iterator* iterator,
     const ShapeTree<se::DeviceMemoryBase>::iterator& end,
     const std::shared_ptr<BufferDefinitionEvent>& definition_event) {
   CHECK(*iterator != end);
 
-  OwningDeviceMemory device_memory((*iterator)->second, device_ordinal,
-                                   allocator);
+  se::OwningDeviceMemory device_memory((*iterator)->second, device_ordinal,
+                                       allocator);
   (*iterator)->second = se::DeviceMemoryBase();
   ++*iterator;
@@ -90,7 +90,7 @@ PySharedDeviceBuffer::FromScopedShapedBuffer(
 /* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>>
 PySharedDeviceBuffer::MakeTuple(
     std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
-    TransferManager* transfer_manager, DeviceMemoryAllocator* allocator,
+    TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator,
     int device_ordinal,
     std::shared_ptr<BufferDefinitionEvent> definition_event) {
   std::vector<Shape> child_shapes;
@@ -102,7 +102,7 @@ PySharedDeviceBuffer::MakeTuple(
 
   Shape shape = ShapeUtil::MakeTupleShape(child_shapes);
   TF_ASSIGN_OR_RETURN(
-      OwningDeviceMemory device_memory,
+      se::OwningDeviceMemory device_memory,
       allocator->Allocate(device_ordinal,
                           transfer_manager->GetByteSizeRequirement(shape)));
   return std::make_shared<PySharedDeviceBuffer>(
@@ -113,10 +113,10 @@ PySharedDeviceBuffer::MakeTuple(
 /* static */ StatusOr<std::shared_ptr<PySharedDeviceBuffer>>
 PySharedDeviceBuffer::MakeArray(
     Shape on_device_shape, TransferManager* transfer_manager,
-    DeviceMemoryAllocator* allocator, int device_ordinal,
+    se::DeviceMemoryAllocator* allocator, int device_ordinal,
     std::shared_ptr<BufferDefinitionEvent> definition_event) {
   TF_ASSIGN_OR_RETURN(
-      OwningDeviceMemory device_memory,
+      se::OwningDeviceMemory device_memory,
       allocator->Allocate(
          device_ordinal,
          transfer_manager->GetByteSizeRequirement(on_device_shape)));
@@ -153,7 +153,7 @@ ShapedBuffer PySharedDeviceBuffer::AsShapedBuffer(
 }
 
 PySharedDeviceBuffer::PySharedDeviceBuffer(
-    Shape on_device_shape, OwningDeviceMemory device_memory,
+    Shape on_device_shape, se::OwningDeviceMemory device_memory,
     std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
     std::shared_ptr<BufferDefinitionEvent> definition_event)
     : on_device_shape_(std::move(on_device_shape)),
@@ -17,11 +17,11 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_
 
 #include "absl/container/flat_hash_set.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 
@@ -93,14 +93,14 @@ class PySharedDeviceBuffer {
   // Makes a tuple buffer. Does not initialize the tuple table.
   static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeTuple(
       std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
-      TransferManager* transfer_manager, DeviceMemoryAllocator* allocator,
+      TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator,
       int device_ordinal,
       std::shared_ptr<BufferDefinitionEvent> definition_event);
 
   // Makes an uninitialized array buffer.
   static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeArray(
       Shape on_device_shape, TransferManager* transfer_manager,
-      DeviceMemoryAllocator* allocator, int device_ordinal,
+      se::DeviceMemoryAllocator* allocator, int device_ordinal,
       std::shared_ptr<BufferDefinitionEvent> definition_event);
 
   // Builds a ShapedBuffer view onto the buffers of 'tree'. Since
@@ -113,7 +113,7 @@ class PySharedDeviceBuffer {
   const std::vector<std::shared_ptr<PySharedDeviceBuffer>>& children() const {
     return children_;
   }
-  const OwningDeviceMemory& device_memory() const { return device_memory_; }
+  const se::OwningDeviceMemory& device_memory() const { return device_memory_; }
   int device_ordinal() const { return device_memory_.device_ordinal(); }
   const std::shared_ptr<BufferDefinitionEvent> definition_event() const {
     return definition_event_;
@@ -121,7 +121,7 @@ class PySharedDeviceBuffer {
 
   PySharedDeviceBuffer() = default;
   PySharedDeviceBuffer(
-      Shape on_device_shape, OwningDeviceMemory device_memory,
+      Shape on_device_shape, se::OwningDeviceMemory device_memory,
       std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
      std::shared_ptr<BufferDefinitionEvent> definition_event);
 
@@ -130,7 +130,7 @@ class PySharedDeviceBuffer {
   // one-to-one with the tree of device buffers, so to avoid representational
   // awkwardness we maintain on-host shapes separately.
   Shape on_device_shape_;
-  OwningDeviceMemory device_memory_;
+  se::OwningDeviceMemory device_memory_;
   std::vector<std::shared_ptr<PySharedDeviceBuffer>> children_;
 
   // An event that is triggered when the content of one or more buffers is
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/python/types.h"
 
 #include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
@@ -437,10 +437,10 @@ tf_cc_test(
     srcs = ["pattern_matcher_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
         "@com_google_absl//absl/strings",
@@ -508,8 +508,8 @@ cc_library(
     hdrs = ["hlo_matchers.h"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:optional",
@@ -552,13 +552,13 @@ tf_cc_test(
     srcs = ["hlo_sharding_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -586,6 +586,7 @@ tf_cc_test(
     srcs = ["call_graph_test.cc"],
     deps = [
         ":call_graph",
+        ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -593,7 +594,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -656,6 +656,7 @@ tf_cc_test(
     deps = [
         ":call_graph",
         ":flatten_call_graph",
+        ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -663,7 +664,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -694,7 +694,6 @@ cc_library(
     deps = [
         ":compiler",
         ":computation_placer",
-        ":device_memory_allocator",
         ":platform_util",
         ":stream_pool",
         ":transfer_manager",
@@ -704,6 +703,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//third_party/eigen3",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
@@ -724,7 +724,6 @@ cc_library(
         ":compiler",
         ":computation_layout",
         ":computation_placer",
-        ":device_memory_allocator",
         ":dump",
         ":dynamic_dimension_inference",
         ":executable",
@@ -754,6 +753,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -770,7 +770,6 @@ cc_library(
         ":backend",
         ":compiler",
         ":computation_layout",
-        ":device_memory_allocator",
         ":executable",
         ":hlo",
         ":hlo_execution_profile",
@@ -790,6 +789,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -858,7 +858,6 @@ cc_library(
     srcs = ["shaped_buffer.cc"],
     hdrs = ["shaped_buffer.h"],
     deps = [
-        ":device_memory_allocator",
         "//tensorflow/compiler/xla:shape_tree",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -868,6 +867,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -881,7 +881,6 @@ tf_cc_test(
     srcs = ["shaped_buffer_test.cc"],
     deps = [
         ":cpu_plugin",
-        ":device_memory_allocator",
         ":platform_util",
         ":shaped_buffer",
         "//tensorflow/compiler/xla:shape_util",
@@ -891,6 +890,7 @@ tf_cc_test(
         "//tensorflow/core:ptr_util",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
     ],
 )
@@ -904,7 +904,6 @@ cc_library(
     ],
     deps = [
         ":computation_layout",
-        ":device_memory_allocator",
         ":dump",
         ":hlo",
         ":hlo_execution_profile",
@@ -925,6 +924,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:span",
@@ -991,7 +991,6 @@ cc_library(
     hdrs = ["allocation_tracker.h"],
     deps = [
         ":backend",
-        ":device_memory_allocator",
         ":transfer_manager",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
@@ -1000,6 +999,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
@@ -1159,6 +1159,7 @@ tf_cc_test(
         ":hlo",
         ":hlo_memory_scheduler",
         ":hlo_ordering",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -1166,7 +1167,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -1208,10 +1208,10 @@ tf_cc_test(
         ":hlo_dataflow_analysis",
         ":hlo_memory_scheduler",
         ":hlo_ordering",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -1458,8 +1458,8 @@ tf_cc_test(
     srcs = ["instruction_fusion_test.cc"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         ":instruction_fusion",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -1470,11 +1470,11 @@ cc_library(
     srcs = ["multi_output_fusion.cc"],
     hdrs = ["multi_output_fusion.h"],
     deps = [
+        ":hlo",
+        ":hlo_pass",
         ":hlo_reachability",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -1791,8 +1791,8 @@ tf_cc_test(
     srcs = ["gather_expander_test.cc"],
     deps = [
         ":gather_expander",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:test_macros_header",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
     ],
@@ -1890,9 +1890,9 @@ tf_cc_test(
     name = "while_loop_analysis_test",
     srcs = ["while_loop_analysis_test.cc"],
     deps = [
+        ":hlo_parser",
         ":while_loop_analysis",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
@@ -2297,7 +2297,7 @@ tf_cc_test(
         ":cpu_plugin",
         ":hlo_cost_analysis",
         ":hlo_execution_profile",
-        "//tensorflow/compiler/xla/service:hlo_parser",
+        ":hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2310,14 +2310,14 @@ tf_cc_test(
     srcs = ["hlo_computation_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_matchers",
+        ":hlo_parser",
         ":pattern_matcher",
         ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -2522,13 +2522,13 @@ tf_cc_test(
     deps = [
         ":hlo",
         ":hlo_liveness_analysis",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
@@ -2912,12 +2912,12 @@ tf_cc_test(
     deps = [
         ":hlo",
         ":hlo_module_dce",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -3043,12 +3043,12 @@ tf_cc_test(
         ":hlo",
         ":hlo_cse",
         ":hlo_matchers",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
@@ -3232,27 +3232,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "device_memory_allocator",
-    srcs = [
-        "device_memory_allocator.cc",
-        "owning_device_memory.cc",
-    ],
-    hdrs = [
-        "device_memory_allocator.h",
-        "owning_device_memory.h",
-    ],
-    deps = [
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:stream_executor_no_cuda",
-        "@com_google_absl//absl/types:span",
-    ],
-)
-
 cc_library(
     name = "maybe_owning_device_memory",
     srcs = [
@@ -3262,7 +3241,7 @@ cc_library(
         "maybe_owning_device_memory.h",
     ],
     deps = [
-        ":device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:variant",
     ],
@@ -3305,10 +3284,10 @@ xla_test(
         "gpu",
     ],
     deps = [
+        ":hlo_parser",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -3431,6 +3410,7 @@ tf_cc_test(
     deps = [
         ":hlo",
         ":hlo_matchers",
+        ":hlo_parser",
         ":shape_inference",
         ":transpose_folding",
         "//tensorflow/compiler/xla:literal",
@@ -3439,7 +3419,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -3682,10 +3661,10 @@ tf_cc_test(
     name = "tuple_util_test",
     srcs = ["tuple_util_test.cc"],
     deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
         ":tuple_util",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
 )
@@ -3711,11 +3690,11 @@ tf_cc_test(
     name = "while_util_test",
     srcs = ["while_util_test.cc"],
     deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
         ":while_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla/service:hlo_matchers",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "@com_google_absl//absl/algorithm:container",
     ],
@@ -3746,9 +3725,9 @@ tf_cc_test(
     srcs = ["while_loop_invariant_code_motion_test.cc"],
     deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
         ":while_loop_invariant_code_motion",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -3774,9 +3753,9 @@ tf_cc_test(
     srcs = ["while_loop_constant_sinking_test.cc"],
     deps = [
+        ":hlo_matchers",
+        ":hlo_parser",
         ":while_loop_constant_sinking",
         "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/core:test",
     ],
@@ -3976,6 +3955,8 @@ cc_library(
     hdrs = ["ar_crs_combiner.h"],
     deps = [
         ":call_graph",
+        ":hlo",
+        ":hlo_pass",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:literal_util",
@@ -3983,8 +3964,6 @@ cc_library(
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
     ],
@@ -4008,11 +3987,11 @@ cc_library(
     srcs = ["dynamic_index_splitter.cc"],
     hdrs = ["dynamic_index_splitter.h"],
     deps = [
+        ":hlo",
        ":hlo_casting_utils",
+        ":hlo_pass",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/service:hlo_pass",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/container:inlined_vector",
@@ -20,13 +20,13 @@ limitations under the License.
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/map_util.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -221,8 +221,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount(
   auto it = allocation_map.find(device_memory.opaque());
   if (it == allocation_map.end()) {
     allocation_map[device_memory.opaque()] = {
-        OwningDeviceMemory(device_memory, device_ordinal,
-                           backend_->memory_allocator()),
+        se::OwningDeviceMemory(device_memory, device_ordinal,
+                               backend_->memory_allocator()),
         /*ref_count=*/1};
   } else {
     it->second.ref_count++;
@@ -77,7 +77,7 @@ class AllocationTracker {
   // Data structure encapsulating single memory allocation on the device.
   struct Allocation {
     // The pointer to this allocation.
-    OwningDeviceMemory device_memory;
+    se::OwningDeviceMemory device_memory;
 
     // This is the number of times this memory allocation is referred to by
     // registered data handles.
@@ -134,7 +134,7 @@ Backend::Backend(se::Platform* platform, Compiler* compiler,
     }
   }
   // Create a memory allocator for the valid stream executors.
-  memory_allocator_ = absl::make_unique<StreamExecutorMemoryAllocator>(
+  memory_allocator_ = absl::make_unique<se::StreamExecutorMemoryAllocator>(
       platform, stream_executors);
   CHECK(!stream_executors_.empty())
       << "Service found no devices for backend " << platform_->Name() << '.';
@@ -27,7 +27,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -35,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace Eigen {
 struct ThreadPoolDevice;
@@ -88,7 +88,7 @@ class Backend {
   // Accessors for the various objects.
   se::Platform* platform() const { return platform_; }
   Compiler* compiler() const { return compiler_; }
-  DeviceMemoryAllocator* memory_allocator() const {
+  se::DeviceMemoryAllocator* memory_allocator() const {
     return memory_allocator_.get();
   }
   TransferManager* transfer_manager() const { return transfer_manager_; }
@@ -179,7 +179,7 @@ class Backend {
       stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
-  std::unique_ptr<StreamExecutorMemoryAllocator> memory_allocator_;
+  std::unique_ptr<se::StreamExecutorMemoryAllocator> memory_allocator_;
 
   // For the CPU backend, an Eigen threadpool device for use by Eigen code.
   struct IntraOpThreadPool;
@@ -75,8 +75,10 @@ class AotCompilationOptions {
 
   // Optional allocator that may be used for allocating temp space on the device
   // during compilation.
-  DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
-  void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
+  se::DeviceMemoryAllocator* device_allocator() const {
+    return device_allocator_;
+  }
+  void set_device_allocator(se::DeviceMemoryAllocator* device_allocator) {
     device_allocator_ = device_allocator;
   }
 
@@ -98,7 +100,7 @@ class AotCompilationOptions {
   AotCompilationOptions();
 
  private:
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
+  se::DeviceMemoryAllocator* device_allocator_ = nullptr;
   DebugOptions debug_options_;
   absl::optional<DeviceAssignment> static_device_assignment_;
 };
@@ -147,14 +149,14 @@ class Compiler {
   // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Optimizes a HLO module group, a set of module which runs concurrently on
   // multiple devices potentially communicating data between the modules.
   virtual Status RunHloPassesOnModuleGroup(
       HloModuleGroup* module_group,
       absl::Span<se::StreamExecutor* const> executors,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
@@ -168,7 +170,7 @@ class Compiler {
   // device_allocator is optional; see RunHloPasses.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules.
@@ -176,7 +178,7 @@ class Compiler {
   RunBackendOnModuleGroup(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
@@ -189,7 +191,7 @@ class Compiler {
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::unique_ptr<HloModuleGroup> module_group,
       std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Returns the backend configurations that the backend will consider for the
   // given HLO. Returns no configurations if the backend does not support
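Because every Compiler entry point now threads a se::DeviceMemoryAllocator* through, each backend's overrides have to change in the same commit (the CPU backend hunks below do exactly this). Schematically, for a hypothetical backend:

class MyBackendCompiler : public Compiler {
 public:
  StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
      std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
      se::DeviceMemoryAllocator* device_allocator) override;
  // ... RunBackend, Compile, etc. updated the same way.
};

An override still spelling the parameter as xla::DeviceMemoryAllocator* would no longer match the base class virtual and would fail to compile, which is what makes this cross-cutting rename safe to land atomically.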
@@ -245,7 +245,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:computation_layout",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
@@ -255,6 +254,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/stream_executor/host:host_stream",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
@@ -537,7 +537,7 @@ Status CreateHloProfilingArtifacts(
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   std::unique_ptr<llvm::TargetMachine> jit_target_machine =
       SimpleOrcJIT::InferTargetMachineForJIT(
           CompilerTargetOptions(module->config()),
@@ -597,7 +597,7 @@ struct OrcJITPostCompilationHook {
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Compiling: " << module->name();
   XLA_SCOPED_LOGGING_TIMER(
       absl::StrFormat("Compiling [%s] for CPU using JIT", module->name()));
@@ -133,11 +133,11 @@ class CpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
@ -73,13 +73,13 @@ CpuExecutable::CpuExecutable(
|
||||
}
|
||||
|
||||
StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
|
||||
std::vector<OwningDeviceMemory>>>
|
||||
std::vector<se::OwningDeviceMemory>>>
|
||||
CpuExecutable::CreateBufferTable(
|
||||
DeviceMemoryAllocator* memory_allocator, int device_ordinal,
|
||||
se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
|
||||
absl::Span<const ShapedBuffer* const> arguments) {
|
||||
std::vector<se::DeviceMemoryBase> unowning_buffers(
|
||||
assignment_->Allocations().size());
|
||||
std::vector<OwningDeviceMemory> owning_buffers(
|
||||
std::vector<se::OwningDeviceMemory> owning_buffers(
|
||||
assignment_->Allocations().size());
|
||||
VLOG(3) << "Allocating " << assignment_->Allocations().size()
|
||||
<< " allocations for module " << module().name();
|
||||
@@ -207,7 +207,7 @@ Status CpuExecutable::ExecuteComputeFunction(

StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
absl::Span<OwningDeviceMemory> buffers) {
absl::Span<se::OwningDeviceMemory> buffers) {
se::Stream* stream = run_options->stream();
ScopedShapedBuffer result_buffer(
/*on_host_shape=*/result_shape(),

@@ -216,7 +216,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
const HloInputOutputAliasConfig& input_output_alias =
module().input_output_alias_config();

// Move OwningDeviceMemory values which contain the array(s) of the result
// Move se::OwningDeviceMemory values which contain the array(s) of the result
// into the respective location in ScopedShapedBuffer which is returned to the
// caller.
TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(

@@ -235,7 +235,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
const BufferAllocation::Slice slice,
this->assignment_->GetUniqueSlice(src, buffer_source->index()));
const BufferAllocation::Index buffer_index = slice.index();
OwningDeviceMemory& buffer = buffers[buffer_index];
se::OwningDeviceMemory& buffer = buffers[buffer_index];
if (!slice.allocation()->is_entry_computation_parameter()) {
// If the buffer coming out of the result is from a parameter, the
// owning buffer will be null, and that means the caller aliased some
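In the aliasing hunk above, each owned result buffer is moved into the returned ScopedShapedBuffer, and entries whose memory was donated by the caller stay null. A sketch of the same move-out-or-leave-null discipline, again with unique_ptr standing in for the owning device-memory type (hypothetical names):

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// Move owned result buffers out of the scratch table into the result.
// Entries that alias a caller-provided parameter stay null in `owned`,
// signalling that the caller, not us, is responsible for that memory.
std::vector<std::unique_ptr<int>> ExtractResults(
    std::vector<std::unique_ptr<int>>& owned,
    const std::vector<bool>& is_parameter_alias) {
  std::vector<std::unique_ptr<int>> result(owned.size());
  for (size_t i = 0; i < owned.size(); ++i) {
    if (!is_parameter_alias[i]) {
      result[i] = std::move(owned[i]);  // Transfer ownership into the result.
    }
    // else: owned[i] stays null; the parameter's owner frees it.
  }
  return result;
}
```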
@@ -297,8 +297,8 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
auto* host_stream = dynamic_cast<se::host::HostStream*>(
run_options->stream()->implementation());
se::Stream* stream = run_options->stream();
DeviceMemoryAllocator* memory_allocator = run_options->allocator();
std::vector<OwningDeviceMemory> owning_buffers;
se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
std::vector<se::OwningDeviceMemory> owning_buffers;
std::vector<se::DeviceMemoryBase> unowning_buffers;
TF_ASSIGN_OR_RETURN(
std::tie(unowning_buffers, owning_buffers),

@@ -326,7 +326,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
CpuExecutable* executable;
ServiceExecutableRunOptions run_options;
std::vector<se::DeviceMemoryBase> unowning_buffers;
std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
std::shared_ptr<std::vector<se::OwningDeviceMemory>> buffers;
HloExecutionProfile* hlo_execution_profile;

void operator()() {

@@ -338,7 +338,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
};
host_stream->EnqueueTask(
AsyncRunTask{this, *run_options, std::move(unowning_buffers),
std::make_shared<std::vector<OwningDeviceMemory>>(
std::make_shared<std::vector<se::OwningDeviceMemory>>(
std::move(owning_buffers)),
hlo_execution_profile});
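EnqueueTask hands the host stream an AsyncRunTask that holds a shared_ptr to the owning buffers, which keeps them alive until the asynchronous task has run. The same keep-alive pattern, reduced to standard C++ with a thread in place of the host stream (illustrative only):

```cpp
#include <cstdio>
#include <memory>
#include <thread>
#include <vector>

int main() {
  auto buffers = std::make_shared<std::vector<int>>(1024, 0);
  // The lambda captures the shared_ptr by value, so the vector stays alive
  // until the asynchronous task finishes, even after `main` has released
  // its own reference.
  std::thread worker([buffers] {
    (*buffers)[0] = 42;
    std::printf("task done, first element = %d\n", (*buffers)[0]);
  });
  buffers.reset();  // Safe: the task still holds a reference.
  worker.join();
  return 0;
}
```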
@@ -25,7 +25,6 @@ limitations under the License.
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"

@@ -37,6 +36,7 @@ limitations under the License.
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace cpu {

@@ -111,8 +111,9 @@ class CpuExecutable : public Executable {
// storage and the live-out buffer into which the computation writes its
// result.
StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
std::vector<OwningDeviceMemory>>>
CreateBufferTable(DeviceMemoryAllocator* memory_allocator, int device_ordinal,
std::vector<se::OwningDeviceMemory>>>
CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator,
int device_ordinal,
absl::Span<const ShapedBuffer* const> arguments);

// Calls the generated function performing the computation with the given

@@ -126,7 +127,7 @@ class CpuExecutable : public Executable {
// The addresses are set according to buffer assignment.
StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
const ServiceExecutableRunOptions* run_options,
absl::Span<OwningDeviceMemory> buffers);
absl::Span<se::OwningDeviceMemory> buffers);

// Returns the points-to set of the root instruction of the entry
// computation. Uses points-to analysis from buffer assignment.
@@ -24,13 +24,11 @@ limitations under the License.
#include "absl/types/variant.h"
#include "tensorflow/compiler/xla/debug_options_flags.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
#include "tensorflow/compiler/xla/service/owning_device_memory.h"
#include "tensorflow/compiler/xla/service/service_executable_run_options.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/shape_tree.h"

@@ -40,6 +38,8 @@ limitations under the License.
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/owning_device_memory.h"

namespace xla {

@@ -47,13 +47,13 @@ namespace xla {
// leftover buffers to be released by the caller.
struct ExecutionOutput {
ExecutionOutput(ScopedShapedBuffer result,
std::vector<OwningDeviceMemory> to_be_released)
std::vector<se::OwningDeviceMemory> to_be_released)
: result(std::move(result)), to_be_released(std::move(to_be_released)) {}
ScopedShapedBuffer result;

// Leftover buffers for the caller to release. Elements in this list are
// donated input memory buffers that are not reused by XLA as outputs.
std::vector<OwningDeviceMemory> to_be_released;
std::vector<se::OwningDeviceMemory> to_be_released;
};

// A given platform's compiler will produce an Executable -- this is a uniform
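ExecutionOutput couples the result with donated input buffers that the caller must still release; both sides depend on an owning handle that frees device memory exactly once. A minimal sketch of such a handle, with host malloc/free standing in for a device allocator (OwnedMemory is a hypothetical name, not the se:: API):

```cpp
#include <cstdlib>
#include <utility>

// Minimal owning handle in the spirit of se::OwningDeviceMemory: frees its
// allocation on destruction, can be moved but not copied, and Forget()
// releases ownership without freeing.
class OwnedMemory {
 public:
  OwnedMemory() = default;
  explicit OwnedMemory(void* ptr) : ptr_(ptr) {}
  OwnedMemory(OwnedMemory&& other) : ptr_(std::exchange(other.ptr_, nullptr)) {}
  OwnedMemory& operator=(OwnedMemory&& other) {
    if (this != &other) {
      std::free(ptr_);
      ptr_ = std::exchange(other.ptr_, nullptr);
    }
    return *this;
  }
  OwnedMemory(const OwnedMemory&) = delete;
  OwnedMemory& operator=(const OwnedMemory&) = delete;
  ~OwnedMemory() { std::free(ptr_); }  // free(nullptr) is a no-op.

  void* Forget() { return std::exchange(ptr_, nullptr); }  // Release, don't free.
  bool is_null() const { return ptr_ == nullptr; }

 private:
  void* ptr_ = nullptr;
};
```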
@@ -282,10 +282,10 @@ cc_library(
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:buffer_assignment",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor:device_memory_allocator",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/types:span",

@@ -408,7 +408,6 @@ cc_library(
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service:buffer_assignment",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:executable",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_execution_profile",

@@ -428,6 +427,7 @@ cc_library(
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:blas",
"//tensorflow/stream_executor:device_memory",
"//tensorflow/stream_executor:device_memory_allocator",
"//tensorflow/stream_executor:kernel",
"@com_google_absl//absl/algorithm:container",
"@com_google_absl//absl/base:core_headers",

@@ -476,7 +476,6 @@ cc_library(
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:compiler",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_casting_utils",
"//tensorflow/compiler/xla/service:hlo_pass",

@@ -485,6 +484,7 @@ cc_library(
"//tensorflow/core:logger",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/core/util/proto:proto_utils",
"//tensorflow/stream_executor:device_memory_allocator",
"@com_google_absl//absl/algorithm:container",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",

@@ -500,8 +500,8 @@ cc_library(
deps = [
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor:device_memory_allocator",
],
)

@@ -517,12 +517,12 @@ cc_library(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor:device_memory",
"//tensorflow/stream_executor:device_memory_allocator",
"//tensorflow/stream_executor:stream_executor_headers",
],
)

@@ -536,12 +536,12 @@ tf_cc_test(
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:test",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/core:test",
"//tensorflow/core/platform/default/build_config:stream_executor_cuda",
"//tensorflow/stream_executor:device_memory_allocator",
"//tensorflow/stream_executor:event",
"//tensorflow/stream_executor:kernel",
"//tensorflow/stream_executor/cuda:cuda_activation",

@@ -634,12 +634,12 @@ cc_library(
"//tensorflow/compiler/xla:literal",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:hlo",
"//tensorflow/compiler/xla/service:hlo_pass",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor:blas",
"//tensorflow/stream_executor:device_memory_allocator",
"@com_google_absl//absl/types:optional",
],
)

@@ -1164,7 +1164,6 @@ cc_library(
"//tensorflow/compiler/xla:shape_util",
"//tensorflow/compiler/xla:status_macros",
"//tensorflow/compiler/xla:util",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:hlo_module_config",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor:stream_executor_headers",
@@ -39,7 +39,7 @@ void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index,

StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
const BufferAssignment* buffer_assignment, int device_ordinal,
DeviceMemoryAllocator* memory_allocator) {
se::DeviceMemoryAllocator* memory_allocator) {
const int64 num_buffers = buffer_assignment->Allocations().size();
auto buffer_allocations = absl::WrapUnique(new BufferAllocations(
num_buffers, device_ordinal, memory_allocator, buffer_assignment));

@@ -77,7 +77,7 @@ StatusOr<std::unique_ptr<BufferAllocations>> BufferAllocations::Builder::Build(
const int64 buffer_size = allocation.size();
se::DeviceMemoryBase buffer_address;
if (buffer_size > 0) {
OwningDeviceMemory buffer;
se::OwningDeviceMemory buffer;
TF_ASSIGN_OR_RETURN(
buffer, memory_allocator->Allocate(device_ordinal, buffer_size));
if (reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=

@@ -23,9 +23,9 @@ limitations under the License.
#include "absl/container/flat_hash_map.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace gpu {

@@ -50,7 +50,7 @@ class BufferAllocations {
// memory on.
StatusOr<std::unique_ptr<BufferAllocations>> Build(
const BufferAssignment* buffer_assignment, int device_ordinal,
DeviceMemoryAllocator* memory_allocator);
se::DeviceMemoryAllocator* memory_allocator);

private:
absl::flat_hash_map<BufferAllocation::Index, se::DeviceMemoryBase>

@@ -62,7 +62,9 @@ class BufferAllocations {
BufferAllocations(const BufferAllocations&) = delete;
BufferAllocations& operator=(const BufferAllocations&) = delete;

DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_; }
se::DeviceMemoryAllocator* memory_allocator() const {
return memory_allocator_;
}
int device_ordinal() const { return device_ordinal_; }

// Returns the device address of buffer `buffer_index`. `buffer_index` must be

@@ -84,7 +86,7 @@ class BufferAllocations {

private:
BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal,
DeviceMemoryAllocator* memory_allocator,
se::DeviceMemoryAllocator* memory_allocator,
const BufferAssignment* buffer_assignment)
: buffers_(buffer_count),
device_ordinal_(device_ordinal),

@@ -104,7 +106,7 @@ class BufferAllocations {
se::DeviceMemoryBase temp_buffer_base_;

int device_ordinal_;
DeviceMemoryAllocator* memory_allocator_;
se::DeviceMemoryAllocator* memory_allocator_;
const BufferAssignment* buffer_assignment_;
bool torn_down_ = false;
};

@@ -256,9 +256,9 @@ StatusOr<AutotuneResult> CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache(
const auto device_ordinal = stream_exec_->device_ordinal();

// allocator either points to this->allocator_ or, if that's null, to a
// StreamExecutorMemoryAllocator for stream_exec_.
DeviceMemoryAllocator* allocator;
optional<StreamExecutorMemoryAllocator> se_allocator;
// se::StreamExecutorMemoryAllocator for stream_exec_.
se::DeviceMemoryAllocator* allocator;
optional<se::StreamExecutorMemoryAllocator> se_allocator;
if (allocator_ != nullptr) {
allocator = allocator_;
} else {
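The pattern above, used by both autotuning passes, is: use the injected allocator when present, otherwise construct a fallback in place inside an optional so that its lifetime is bounded by the enclosing scope. A generic sketch with std::optional (Allocator is a placeholder type, not the se:: class):

```cpp
#include <optional>

struct Allocator {
  // Placeholder for se::DeviceMemoryAllocator.
};

void RunWithAllocator(Allocator* injected) {
  // `allocator` points either at the injected instance or at a locally
  // constructed fallback whose lifetime is tied to this scope.
  Allocator* allocator = nullptr;
  std::optional<Allocator> fallback;
  if (injected != nullptr) {
    allocator = injected;
  } else {
    fallback.emplace();      // Construct the fallback in place.
    allocator = &*fallback;  // Valid until this function returns.
  }
  (void)allocator;  // ... use `allocator` for scratch allocations ...
}
```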
@@ -19,13 +19,13 @@ limitations under the License.
#include "absl/time/time.h"
#include "absl/types/optional.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h"
#include "tensorflow/compiler/xla/service/hlo_instructions.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/protobuf/autotuning.pb.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace gpu {

@@ -38,7 +38,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
// memory while timing the various convolution algorithms. If it's null,
// we'll use the default allocator on the StreamExecutor.
CudnnConvAlgorithmPicker(se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator, Compiler* compiler)
se::DeviceMemoryAllocator* allocator,
Compiler* compiler)
: stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {}

absl::string_view name() const override {

@@ -56,7 +57,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass {
const HloCustomCallInstruction* instr);

se::StreamExecutor* stream_exec_; // never null
DeviceMemoryAllocator* allocator_; // may be null
se::DeviceMemoryAllocator* allocator_; // may be null
Compiler* compiler_;
};

@@ -174,9 +174,9 @@ StatusOr<bool> CusolverRewriter::RunOnComputation(HloComputation* computation) {
const auto device_ordinal = stream_exec_->device_ordinal();

// allocator either points to this->allocator_ or, if that's null, to a
// StreamExecutorMemoryAllocator for stream_exec_.
DeviceMemoryAllocator* allocator;
absl::optional<StreamExecutorMemoryAllocator> se_allocator;
// se::StreamExecutorMemoryAllocator for stream_exec_.
se::DeviceMemoryAllocator* allocator;
absl::optional<se::StreamExecutorMemoryAllocator> se_allocator;
if (allocator_ != nullptr) {
allocator = allocator_;
} else {

@@ -200,7 +200,7 @@ StatusOr<bool> CusolverRewriter::RunOnComputation(HloComputation* computation) {
}

CusolverRewriter::CusolverRewriter(se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator)
se::DeviceMemoryAllocator* allocator)
: stream_exec_(stream_exec), allocator_(allocator) {}

StatusOr<bool> CusolverRewriter::Run(HloModule* module) {

@@ -16,12 +16,12 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_

#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/gpu/cusolver_context.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace gpu {

@@ -30,7 +30,7 @@ namespace gpu {
class CusolverRewriter : public HloModulePass {
public:
CusolverRewriter(se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* allocator);
se::DeviceMemoryAllocator* allocator);
absl::string_view name() const override { return "cusolver-rewriter"; }

StatusOr<bool> Run(HloModule* module) override;

@@ -39,7 +39,7 @@ class CusolverRewriter : public HloModulePass {
StatusOr<bool> RunOnComputation(HloComputation* computation);

se::StreamExecutor* stream_exec_; // never null
DeviceMemoryAllocator* allocator_; // may be null
se::DeviceMemoryAllocator* allocator_; // may be null
};

} // namespace gpu

@@ -29,7 +29,7 @@ namespace xla {
namespace gpu {

FftScratchAllocator::FftScratchAllocator(
int device_ordinal, DeviceMemoryAllocator* memory_allocator)
int device_ordinal, se::DeviceMemoryAllocator* memory_allocator)
: device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}

int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) {

@@ -48,7 +48,7 @@ StatusOr<se::DeviceMemory<uint8>> FftScratchAllocator::AllocateBytes(
byte_size, GetMemoryLimitInBytes(stream)));
}

TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
memory_allocator_->Allocate(device_ordinal_, byte_size,
/*retry_on_failure=*/false));
total_allocated_bytes_ += byte_size;

@@ -38,7 +38,7 @@ namespace gpu {
class FftScratchAllocator : public se::ScratchAllocator {
public:
FftScratchAllocator(int device_ordinal,
DeviceMemoryAllocator* memory_allocator);
se::DeviceMemoryAllocator* memory_allocator);

int64 GetMemoryLimitInBytes(se::Stream* stream) override;

@@ -49,8 +49,8 @@ class FftScratchAllocator : public se::ScratchAllocator {

private:
const int device_ordinal_;
DeviceMemoryAllocator* memory_allocator_;
std::vector<OwningDeviceMemory> allocated_buffers_;
se::DeviceMemoryAllocator* memory_allocator_;
std::vector<se::OwningDeviceMemory> allocated_buffers_;
int64 total_allocated_bytes_ = 0;
};
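FftScratchAllocator hands out temporary buffers against a memory limit, retains ownership of everything it allocates, and tracks the running total. A self-contained sketch of that contract (ScratchPool is hypothetical; real code returns a resource-exhausted Status instead of nullptr):

```cpp
#include <cstdint>
#include <memory>
#include <vector>

// Sketch of a scratch allocator: every buffer it hands out stays in
// allocated_buffers_ so it is freed when the allocator itself is destroyed,
// and a running byte count enforces the limit.
class ScratchPool {
 public:
  explicit ScratchPool(int64_t limit_bytes) : limit_bytes_(limit_bytes) {}

  void* AllocateBytes(int64_t byte_size) {
    if (total_allocated_bytes_ + byte_size > limit_bytes_) {
      return nullptr;  // Over budget.
    }
    allocated_buffers_.push_back(std::make_unique<char[]>(byte_size));
    total_allocated_bytes_ += byte_size;
    return allocated_buffers_.back().get();
  }

 private:
  const int64_t limit_bytes_;
  std::vector<std::unique_ptr<char[]>> allocated_buffers_;
  int64_t total_allocated_bytes_ = 0;
};
```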
@@ -230,7 +230,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::Execute(
const ServiceExecutableRunOptions* run_options,
absl::Span<const ShapedBuffer* const> arguments,
HloExecutionProfile* hlo_execution_profile, bool block_host_until_done) {
DeviceMemoryAllocator* memory_allocator = run_options->allocator();
se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();

if (GetRootPointsToSet().IsAmbiguous()) {
return Unimplemented("Points-to set of root instruction is ambiguous");

@@ -348,7 +348,7 @@ StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream(
StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
const ServiceExecutableRunOptions* run_options,
absl::Span<const ShapedBuffer* const> arguments) {
DeviceMemoryAllocator* memory_allocator = run_options->allocator();
se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
// Force synchronous execution if the allocator requires it.
bool block_host_until_done =
!memory_allocator->AllowsAsynchronousDeallocation();

@@ -24,7 +24,6 @@ limitations under the License.
#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"

@@ -38,6 +37,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace gpu {

@@ -164,7 +164,7 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) {
// It takes a compiler pointer, as passes may compile and execute HLOs on the
// fly for cuDNN verification or other purposes.
Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator,
se::DeviceMemoryAllocator* device_allocator,
Compiler* compiler) {
{
HloPassPipeline pipeline("optimization");

@@ -463,7 +463,7 @@ NVPTXCompiler::NVPTXCompiler()

StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
// We dump the post-optimization HLO in RunBackend so no need to dump it here.
XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses");
tensorflow::profiler::TraceMe activity(

@@ -479,7 +479,7 @@ StatusOr<std::unique_ptr<HloModule>> NVPTXCompiler::RunHloPasses(

StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend");

TF_RET_CHECK(stream_exec != nullptr);

@@ -53,11 +53,11 @@ class NVPTXCompiler : public LLVMCompiler {

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,

@@ -50,7 +50,7 @@ StatusOr<se::DeviceMemory<uint8>> RedzoneAllocator::AllocateBytes(

int64 rhs_slop = RoundUpToNearest(byte_size, kRhsRedzoneAlign) - byte_size;
TF_ASSIGN_OR_RETURN(
OwningDeviceMemory allocated_buffer,
se::OwningDeviceMemory allocated_buffer,
memory_allocator_->Allocate(device_ordinal_,
byte_size + 2 * redzone_size_ + rhs_slop,
/*retry_on_failure=*/false));
@@ -18,12 +18,12 @@ limitations under the License.

#include <vector>

#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/gpu/gpu_constants.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/service/owning_device_memory.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/owning_device_memory.h"

namespace xla {
namespace gpu {

@@ -41,7 +41,8 @@ namespace gpu {
// memory for cudnn convolutions.
class RedzoneAllocator : public se::ScratchAllocator {
public:
RedzoneAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator,
RedzoneAllocator(int device_ordinal,
se::DeviceMemoryAllocator* memory_allocator,
const HloModuleConfig& hlo_module_config,
int64 redzone_size = 1 << 23, // 8MiB per side, 16MiB total
uint8 redzone_pattern = -1)

@@ -76,14 +77,14 @@ class RedzoneAllocator : public se::ScratchAllocator {
const int64 redzone_size_;

const uint8 redzone_pattern_;
DeviceMemoryAllocator* memory_allocator_;
se::DeviceMemoryAllocator* memory_allocator_;
const HloModuleConfig& hlo_module_config_;

// The second element of the pair is the size of the user allocation. This
// isn't necessarily just first.size() - 2 * redzone_size_ because when the
// user allocation size is not a multiple of 4 bytes, we round up the size of
// the RHS redzone.
std::vector<std::pair<OwningDeviceMemory, int64>> allocated_buffers_;
std::vector<std::pair<se::OwningDeviceMemory, int64>> allocated_buffers_;

int64 allocated_bytes_excluding_redzones_ = 0;
};
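RedzoneAllocator lays each allocation out as [lhs redzone | user buffer | slop | rhs redzone]: the slop pads the user size up to the redzone alignment, and both redzones are filled with a known byte pattern so that out-of-bounds writes by autotuned kernels can be detected afterwards. A host-memory sketch of the layout and the check (toy constants, not the pass's defaults):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

constexpr int64_t kRedzoneSize = 64;      // Toy value; the pass defaults to 8 MiB.
constexpr int64_t kRedzoneAlign = 4;      // Slop rounds the user size up to this.
constexpr uint8_t kRedzonePattern = 0xAB;

int64_t RoundUpToNearest(int64_t v, int64_t align) {
  return (v + align - 1) / align * align;
}

// Allocates [redzone | user | slop | redzone] and fills the redzones (and the
// slop) with the pattern.
std::vector<uint8_t> AllocateWithRedzones(int64_t user_size) {
  int64_t slop = RoundUpToNearest(user_size, kRedzoneAlign) - user_size;
  std::vector<uint8_t> buf(user_size + slop + 2 * kRedzoneSize);
  std::memset(buf.data(), kRedzonePattern, kRedzoneSize);               // lhs
  std::memset(buf.data() + kRedzoneSize + user_size, kRedzonePattern,
              slop + kRedzoneSize);                                     // slop + rhs
  return buf;
}

// Returns true if no byte of either redzone was clobbered.
bool RedzonesIntact(const std::vector<uint8_t>& buf, int64_t user_size) {
  for (int64_t i = 0; i < kRedzoneSize; ++i) {
    if (buf[i] != kRedzonePattern) return false;                        // lhs
  }
  for (size_t i = kRedzoneSize + user_size; i < buf.size(); ++i) {
    if (buf[i] != kRedzonePattern) return false;                        // slop + rhs
  }
  return true;
}
```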
@@ -15,13 +15,13 @@ limitations under the License.

#include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h"

#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/hlo_module_config.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/multi_platform_manager.h"
#include "tensorflow/stream_executor/platform.h"

@@ -42,7 +42,7 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) {
se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie();
HloModuleConfig config;
StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config,
kRedzoneSize, kRedzonePattern);

@@ -118,7 +118,7 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) {
se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie();
se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie();
HloModuleConfig config;
StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec});
RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config,
kRedzoneSize, /*redzone_pattern=*/-1);
se::Stream stream(stream_exec);

@@ -29,7 +29,7 @@ StatusOr<se::DeviceMemory<uint8>> ScratchAllocator::AllocateBytes(
byte_size, GetMemoryLimitInBytes(stream)));
}

TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer,
TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer,
memory_allocator_->Allocate(device_ordinal_, byte_size,
/*retry_on_failure=*/false));
total_allocated_bytes_ += byte_size;

@@ -18,18 +18,19 @@ limitations under the License.

#include <vector>

#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/owning_device_memory.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/owning_device_memory.h"

namespace xla {
namespace gpu {

class ScratchAllocator : public se::ScratchAllocator {
public:
ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator)
ScratchAllocator(int device_ordinal,
se::DeviceMemoryAllocator* memory_allocator)
: device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {}

int64 GetMemoryLimitInBytes(se::Stream* stream) override {

@@ -50,8 +51,8 @@ class ScratchAllocator : public se::ScratchAllocator {

private:
const int device_ordinal_;
DeviceMemoryAllocator* memory_allocator_;
std::vector<OwningDeviceMemory> allocated_buffers_;
se::DeviceMemoryAllocator* memory_allocator_;
std::vector<se::OwningDeviceMemory> allocated_buffers_;
int64 total_allocated_bytes_ = 0;
};
@@ -96,7 +96,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) {

StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* /*stream_exec*/,
DeviceMemoryAllocator* /*device_allocator*/) {
se::DeviceMemoryAllocator* /*device_allocator*/) {
VLOG(1) << "Run hlo passes on graph " << hlo_module->name();
TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get()));
return std::move(hlo_module);

@@ -105,13 +105,13 @@ StatusOr<std::unique_ptr<HloModule>> InterpreterCompiler::RunHloPasses(
Status InterpreterCompiler::RunHloPassesOnModuleGroup(
HloModuleGroup* module_group,
absl::Span<se::StreamExecutor* const> executors,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
return Unimplemented("Module group compilation not supported on Interpreter");
}

StatusOr<std::unique_ptr<Executable>> InterpreterCompiler::RunBackend(
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* /*device_allocator*/) {
se::DeviceMemoryAllocator* /*device_allocator*/) {
TF_RET_CHECK(stream_exec != nullptr);

VLOG(1) << "Run backend " << hlo_module->name();

@@ -137,7 +137,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>>
InterpreterCompiler::RunBackendOnModuleGroup(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
return Unimplemented(
"Module group compilation is not supported on Interpreter.");
}

@@ -145,7 +145,7 @@ InterpreterCompiler::RunBackendOnModuleGroup(
StatusOr<std::vector<std::unique_ptr<Executable>>> InterpreterCompiler::Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
if (module_group->empty()) {
return std::vector<std::unique_ptr<Executable>>();
}

@@ -45,24 +45,24 @@ class InterpreterCompiler : public Compiler {

StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;
Status RunHloPassesOnModuleGroup(
HloModuleGroup* module_group,
absl::Span<se::StreamExecutor* const> executors,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::unique_ptr<Executable>> RunBackend(
std::unique_ptr<HloModule> hlo_module, se::StreamExecutor* stream_exec,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;
StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,

@@ -24,7 +24,7 @@ namespace xla {
Status LLVMCompiler::RunHloPassesOnModuleGroup(
HloModuleGroup* module_group,
absl::Span<se::StreamExecutor* const> executors,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
return Unimplemented(
"Model partitioning not implemented for the CPU/GPU compilers!");
}

@@ -33,7 +33,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>>
LLVMCompiler::RunBackendOnModuleGroup(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
return Unimplemented(
"Model partitioning not implemented for the CPU/GPU compilers!");
}

@@ -41,7 +41,7 @@ LLVMCompiler::RunBackendOnModuleGroup(
StatusOr<std::vector<std::unique_ptr<Executable>>> LLVMCompiler::Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
// Tensorflow tries to enable the following behaviors in all its threads:
//
// - Denormals are zero (DAZ): roughly, operations treat denormal floats as

@@ -61,28 +61,28 @@ class LLVMCompiler : public Compiler {
// StatusOr<std::unique_ptr<Executable>> RunBackend(
// std::unique_ptr<HloModule> module,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
// se::DeviceMemoryAllocator* device_allocator)
// StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
// std::unique_ptr<HloModule> module,
// se::StreamExecutor* stream_exec,
// DeviceMemoryAllocator* device_allocator)
// se::DeviceMemoryAllocator* device_allocator)
using Compiler::RunBackend;
using Compiler::RunHloPasses;

Status RunHloPassesOnModuleGroup(
HloModuleGroup* module_group,
absl::Span<se::StreamExecutor* const> executors,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<Executable>>> RunBackendOnModuleGroup(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_exec,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
std::unique_ptr<HloModuleGroup> module_group,
std::vector<std::vector<se::StreamExecutor*>> stream_execs,
DeviceMemoryAllocator* device_allocator) override;
se::DeviceMemoryAllocator* device_allocator) override;

protected:
ModuleHook user_pre_optimization_hook_;
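The `using Compiler::RunBackend; using Compiler::RunHloPasses;` lines above work around C++ name hiding: an overload declared in a derived class hides all base-class overloads of the same name, and a using-declaration brings them back into scope. A minimal, self-contained demonstration:

```cpp
#include <iostream>

struct Base {
  void Run(int x) { std::cout << "Base::Run(int): " << x << "\n"; }
  void Run(int x, int y) { std::cout << "Base::Run(int,int): " << x + y << "\n"; }
};

struct Derived : Base {
  using Base::Run;  // Without this, Run(int) below hides Run(int,int).
  void Run(int x) { std::cout << "Derived::Run(int): " << x << "\n"; }
};

int main() {
  Derived d;
  d.Run(1);     // Derived::Run(int)
  d.Run(1, 2);  // Base::Run(int,int) -- visible only via the using-declaration.
  return 0;
}
```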
@@ -23,13 +23,13 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/service.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {

@@ -17,25 +17,29 @@ limitations under the License.
#include "absl/types/variant.h"
namespace xla {

se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() {
tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() {
if (HasOwnership()) {
return absl::get<OwningDeviceMemory>(mem_).AsDeviceMemoryBase();
return absl::get<tensorflow::se::OwningDeviceMemory>(mem_)
.AsDeviceMemoryBase();
} else {
return absl::get<se::DeviceMemoryBase>(mem_);
return absl::get<tensorflow::se::DeviceMemoryBase>(mem_);
}
}

bool MaybeOwningDeviceMemory::HasOwnership() const {
return absl::holds_alternative<OwningDeviceMemory>(mem_);
return absl::holds_alternative<tensorflow::se::OwningDeviceMemory>(mem_);
}

absl::optional<OwningDeviceMemory> MaybeOwningDeviceMemory::Release() {
absl::optional<tensorflow::se::OwningDeviceMemory>
MaybeOwningDeviceMemory::Release() {
if (!HasOwnership()) {
return {};
}
OwningDeviceMemory result = std::move(absl::get<OwningDeviceMemory>(mem_));
tensorflow::se::OwningDeviceMemory result =
std::move(absl::get<tensorflow::se::OwningDeviceMemory>(mem_));
mem_ = result.AsDeviceMemoryBase();
return absl::make_optional<tensorflow::se::OwningDeviceMemory>(
std::move(result));
}

} // namespace xla
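MaybeOwningDeviceMemory is a tagged union over an owning handle and a raw, unowned view; Release() moves the owning handle out to the caller and downgrades the stored state to the unowned view. A simplified model of that behavior using std::variant, with unique_ptr<int> and int* standing in for the two se:: types:

```cpp
#include <memory>
#include <optional>
#include <variant>

// Simplified model: unique_ptr<int> plays se::OwningDeviceMemory and
// int* plays se::DeviceMemoryBase.
class MaybeOwning {
 public:
  explicit MaybeOwning(std::unique_ptr<int> owned) : mem_(std::move(owned)) {}
  explicit MaybeOwning(int* unowned) : mem_(unowned) {}

  bool HasOwnership() const {
    return std::holds_alternative<std::unique_ptr<int>>(mem_);
  }

  // Never transfers ownership; just exposes the address.
  int* AsPointer() {
    if (HasOwnership()) return std::get<std::unique_ptr<int>>(mem_).get();
    return std::get<int*>(mem_);
  }

  // Moves the owning handle to the caller and keeps only an unowned view,
  // mirroring MaybeOwningDeviceMemory::Release().
  std::optional<std::unique_ptr<int>> Release() {
    if (!HasOwnership()) return std::nullopt;
    std::unique_ptr<int> result =
        std::move(std::get<std::unique_ptr<int>>(mem_));
    mem_ = result.get();  // Downgrade to a non-owning view.
    return result;
  }

 private:
  std::variant<std::unique_ptr<int>, int*> mem_;
};
```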
@@ -18,30 +18,30 @@ limitations under the License.

#include "absl/types/optional.h"
#include "absl/types/variant.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/owning_device_memory.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/owning_device_memory.h"

namespace xla {

// MaybeOwningDeviceMemory represents either an owned or unowned device memory.
// Like std::variant<OwningDeviceMemory, DeviceMemory>. When the object goes
// Like std::variant<se::OwningDeviceMemory, DeviceMemory>. When the object goes
// out of scope, it will free the underlying memory if it owns it.
class MaybeOwningDeviceMemory {
public:
MaybeOwningDeviceMemory() = default;
explicit MaybeOwningDeviceMemory(OwningDeviceMemory owned)
explicit MaybeOwningDeviceMemory(tensorflow::se::OwningDeviceMemory owned)
: mem_(std::move(owned)) {}
explicit MaybeOwningDeviceMemory(se::DeviceMemoryBase unowned)
explicit MaybeOwningDeviceMemory(tensorflow::se::DeviceMemoryBase unowned)
: mem_(unowned) {}
MaybeOwningDeviceMemory(MaybeOwningDeviceMemory&&) = default;
~MaybeOwningDeviceMemory() = default;

MaybeOwningDeviceMemory& operator=(se::DeviceMemoryBase unowned) {
MaybeOwningDeviceMemory& operator=(tensorflow::se::DeviceMemoryBase unowned) {
mem_ = unowned;
return *this;
}

MaybeOwningDeviceMemory& operator=(OwningDeviceMemory owned) {
MaybeOwningDeviceMemory& operator=(tensorflow::se::OwningDeviceMemory owned) {
mem_ = std::move(owned);
return *this;
}

@@ -50,19 +50,21 @@ class MaybeOwningDeviceMemory {

// Fetches the underlying DeviceMemoryBase from a MaybeOwningDeviceMemory. The
// caller of this function is *not* responsible for freeing the memory.
se::DeviceMemoryBase AsDeviceMemoryBase();
tensorflow::se::DeviceMemoryBase AsDeviceMemoryBase();

// Release the OwningDeviceMemory without freeing it, and moves the ownership
// of the memory buffer from the object to the caller.
// Release the tensorflow::se::OwningDeviceMemory without freeing it, and
// moves the ownership of the memory buffer from the object to the caller.
//
// A nullopt is returned if HasOwnership() == false.
absl::optional<OwningDeviceMemory> Release();
absl::optional<tensorflow::se::OwningDeviceMemory> Release();

// Returns true if the device_memory has ownership over underlying memory.
bool HasOwnership() const;

private:
absl::variant<OwningDeviceMemory, se::DeviceMemoryBase> mem_;
absl::variant<tensorflow::se::OwningDeviceMemory,
tensorflow::se::DeviceMemoryBase>
mem_;
};

} // namespace xla
@@ -29,7 +29,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/compiler.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/computation_placer.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/dump.h"
#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
#include "tensorflow/compiler/xla/service/executable.h"

@@ -58,6 +57,7 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/ptr_util.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace {

@@ -347,7 +347,7 @@ StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator) {
se::DeviceMemoryAllocator* device_allocator) {
VLOG(1) << StrFormat("BuildExecutable on service %p", this);

// Dump computation proto state if flag is set.

@@ -783,7 +783,7 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
const HloModuleProto& module_proto,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) {
VLOG(1) << StrFormat(
"BuildExecutable on service %p with serialized module proto: %s", this,
module_proto.name());

@@ -29,7 +29,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/channel_tracker.h"
#include "tensorflow/compiler/xla/service/compilation_cache.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/execution_tracker.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"

@@ -43,6 +42,7 @@ limitations under the License.
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {

@@ -234,7 +234,7 @@ class Service : public ServiceInterface {
const HloModuleProto& module_proto,
std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
se::StreamExecutor* executor,
DeviceMemoryAllocator* device_allocator = nullptr);
se::DeviceMemoryAllocator* device_allocator = nullptr);

// Same as BuildExecutable() above, but builds a list of Executables for the
// given computations that may interact with each other.

@@ -242,7 +242,7 @@ class Service : public ServiceInterface {
const std::vector<const HloModuleProto*>& module_protos,
std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
DeviceMemoryAllocator* device_allocator);
se::DeviceMemoryAllocator* device_allocator);

// Runs the given executable with the given arguments and register the result
// in the allocation tracker. The handle of the result from the tracker is

@@ -43,7 +43,9 @@ class ServiceExecutableRunOptions {

// Delegate to `ExecutableRunOptions` member.
se::Stream* stream() const { return run_options_.stream(); }
DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); }
se::DeviceMemoryAllocator* allocator() const {
return run_options_.allocator();
}
int device_ordinal() const { return run_options_.device_ordinal(); }

// Borrows a stream and returns a smart pointer which returns the stream on

@@ -119,14 +119,14 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) {

ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape,
const Shape& on_device_shape,
DeviceMemoryAllocator* allocator,
se::DeviceMemoryAllocator* allocator,
int device_ordinal)
: ShapedBuffer(on_host_shape, on_device_shape, allocator->platform(),
device_ordinal),
allocator_(allocator) {}

ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer,
DeviceMemoryAllocator* allocator)
se::DeviceMemoryAllocator* allocator)
: ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {}

ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s)

@@ -21,12 +21,12 @@ limitations under the License.
#include <string>

#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {

@@ -138,13 +138,13 @@ class ScopedShapedBuffer : public ShapedBuffer {
// Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index.
explicit ScopedShapedBuffer(const Shape& on_host_shape,
const Shape& on_device_shape,
DeviceMemoryAllocator* allocator,
se::DeviceMemoryAllocator* allocator,
int device_ordinal);

// Create a ScopedShapedBuffer by taking over the memory from the incoming
// ShapedBuffer.
explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer,
DeviceMemoryAllocator* allocator);
se::DeviceMemoryAllocator* allocator);

// Movable, but not copyable.
ScopedShapedBuffer(ScopedShapedBuffer&& s);

@@ -157,13 +157,13 @@ class ScopedShapedBuffer : public ShapedBuffer {

// Return the allocator used to allocate the device memory held in this
// ScopedShapedBuffer.
DeviceMemoryAllocator* memory_allocator() const { return allocator_; }
se::DeviceMemoryAllocator* memory_allocator() const { return allocator_; }

// Sets the device memory buffer at the given index.
//
// If the given buffer's device memory is non-null, its device_ordinal and
// allocator must match those in `this`.
void set_buffer(OwningDeviceMemory buffer, const ShapeIndex& index) {
void set_buffer(se::OwningDeviceMemory buffer, const ShapeIndex& index) {
if (!buffer.is_null()) {
CHECK_EQ(buffer.device_ordinal(), device_ordinal());
CHECK_EQ(buffer.allocator(), allocator_);

@@ -187,7 +187,7 @@ class ScopedShapedBuffer : public ShapedBuffer {
protected:
void Deallocate();

DeviceMemoryAllocator* allocator_;
se::DeviceMemoryAllocator* allocator_;
};

} // namespace xla
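ScopedShapedBuffer ties every buffer it holds to the allocator that produced it, and the CHECKs in set_buffer enforce that an adopted buffer came from the same device and allocator, so that Deallocate() can free everything uniformly. A compact RAII sketch of that invariant (toy types; assert in place of CHECK_EQ):

```cpp
#include <cassert>
#include <cstdlib>
#include <vector>

// Toy allocator interface; stands in for se::DeviceMemoryAllocator.
struct Alloc {
  void* Allocate(size_t n) { return std::malloc(n); }
  void Deallocate(void* p) { std::free(p); }
};

// Scoped container: frees every held buffer through its allocator on
// destruction, so callers can't leak or double-free.
class ScopedBuffers {
 public:
  ScopedBuffers(Alloc* a, size_t count) : alloc_(a), bufs_(count, nullptr) {}
  ~ScopedBuffers() {
    for (void* p : bufs_) alloc_->Deallocate(p);  // free(nullptr) is a no-op.
  }

  // Adopt a buffer; the invariant is that it came from the same allocator.
  void set_buffer(void* p, size_t index, Alloc* source_allocator) {
    assert(source_allocator == alloc_);  // Mirrors the CHECK_EQ in set_buffer.
    alloc_->Deallocate(bufs_[index]);
    bufs_[index] = p;
  }

 private:
  Alloc* alloc_;
  std::vector<void*> bufs_;
};
```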
@@ -16,13 +16,13 @@ limitations under the License.
#include "tensorflow/compiler/xla/service/shaped_buffer.h"

#include "absl/memory/memory.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/test.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/util/ptr_util.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace {

@@ -34,7 +34,7 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) {
auto* platform = platforms[0];
TF_ASSERT_OK_AND_ASSIGN(auto executors,
xla::PlatformUtil::GetStreamExecutors(platform));
xla::StreamExecutorMemoryAllocator allocator(platform, executors);
xla::se::StreamExecutorMemoryAllocator allocator(platform, executors);
const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {});
const int kDeviceOrdinal = 0;
auto scoped_buffer = absl::make_unique<xla::ScopedShapedBuffer>(

@@ -43,11 +43,11 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) {
buffer = nullptr;
}

class TestAllocator : public DeviceMemoryAllocator {
class TestAllocator : public se::DeviceMemoryAllocator {
public:
TestAllocator()
: DeviceMemoryAllocator(PlatformUtil::GetDefaultPlatform().ValueOrDie()) {
}
: se::DeviceMemoryAllocator(
PlatformUtil::GetDefaultPlatform().ValueOrDie()) {}

~TestAllocator() override {
if (!allocations_.empty()) {

@@ -56,18 +56,18 @@ class TestAllocator : public DeviceMemoryAllocator {
}

// Pull in two-arg overload of Allocate.
using DeviceMemoryAllocator::Allocate;
using se::DeviceMemoryAllocator::Allocate;

StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
bool /*retry_on_failure*/) override {
StatusOr<se::OwningDeviceMemory> Allocate(
int device_ordinal, uint64 size, bool /*retry_on_failure*/) override {
// By contract, we must return null if size == 0.
if (size == 0) {
return OwningDeviceMemory();
return se::OwningDeviceMemory();
}
void* buf = malloc(size);
allocations_.insert({device_ordinal, buf});
return OwningDeviceMemory(se::DeviceMemoryBase(buf, size), device_ordinal,
this);
return se::OwningDeviceMemory(se::DeviceMemoryBase(buf, size),
device_ordinal, this);
}

Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override {
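TestAllocator records every (device ordinal, pointer) pair it hands out and complains at destruction if anything is still outstanding; the size == 0 branch exists because the allocator contract requires a null buffer for empty allocations. A standalone sketch of the same leak-checking idea (LeakCheckAllocator is hypothetical):

```cpp
#include <cstdio>
#include <cstdlib>
#include <set>
#include <utility>

// Sketch of a leak-checking test allocator: every Allocate is recorded, every
// Deallocate erases the record, and anything left at destruction is a leak.
class LeakCheckAllocator {
 public:
  ~LeakCheckAllocator() {
    if (!allocations_.empty()) {
      std::fprintf(stderr, "leaked %zu allocation(s)\n", allocations_.size());
    }
  }

  void* Allocate(int device_ordinal, size_t size) {
    if (size == 0) return nullptr;  // Contract: empty allocations are null.
    void* buf = std::malloc(size);
    allocations_.insert({device_ordinal, buf});
    return buf;
  }

  void Deallocate(int device_ordinal, void* buf) {
    allocations_.erase({device_ordinal, buf});
    std::free(buf);
  }

 private:
  std::set<std::pair<int, void*>> allocations_;
};
```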
@ -120,7 +120,7 @@ TEST(ScopedShapedBufferTest, TestTakeSubTree) {
|
||||
sb.buffers().ForEachMutableElement(
|
||||
[&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
|
||||
TF_ASSERT_OK_AND_ASSIGN(
|
||||
OwningDeviceMemory m,
|
||||
se::OwningDeviceMemory m,
|
||||
allocator.Allocate(/*device_ordinal=*/0, /*size=*/77));
|
||||
*buffer = m.Forget();
|
||||
});
|
||||
@ -158,7 +158,7 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) {
|
||||
sb.buffers().ForEachMutableElement(
|
||||
[&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) {
|
||||
TF_ASSERT_OK_AND_ASSIGN(
|
||||
OwningDeviceMemory m,
|
||||
se::OwningDeviceMemory m,
|
||||
allocator.Allocate(/*device_ordinal=*/0, /*size=*/32));
|
||||
*buffer = m.Forget();
|
||||
});
|
||||
|
@ -308,7 +308,7 @@ Status TransferManager::TransferBufferToDevice(
|
||||
}
|
||||
|
||||
StatusOr<ScopedShapedBuffer> TransferManager::AllocateScopedShapedBuffer(
|
||||
const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
|
||||
const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator,
|
||||
int device_ordinal) {
|
||||
if (!LayoutUtil::HasLayout(on_host_shape)) {
|
||||
return InvalidArgument("Shape must have a layout: %s",
|
||||
|
@ -229,7 +229,7 @@ class TransferManager {
|
||||
// shape. The on-device shape may be different as indicated by
|
||||
// HostShapeToDeviceShape.
|
||||
StatusOr<ScopedShapedBuffer> AllocateScopedShapedBuffer(
|
||||
const Shape& on_host_shape, DeviceMemoryAllocator* allocator,
|
||||
const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator,
|
||||
int device_ordinal);
|
||||
|
||||
// The given ShapedBuffer holds a handle to allocated memory, but it is not
|
||||
|
@ -259,7 +259,6 @@ cc_library(
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:computation_placer",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/compiler/xla/service:platform_util",
"//tensorflow/compiler/xla/service:shaped_buffer",
@ -268,6 +267,7 @@ cc_library(
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor:device_memory_allocator",
"//third_party/eigen3",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/types:span",
@ -1172,7 +1172,6 @@ xla_test(
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/service:computation_placer",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/compiler/xla/service:platform_util",
"//tensorflow/compiler/xla/service:shaped_buffer",
@ -1183,6 +1182,7 @@ xla_test(
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/core:test",
"//tensorflow/stream_executor:device_memory_allocator",
],
)

@ -2078,7 +2078,6 @@ xla_test(
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/client:xla_builder",
"//tensorflow/compiler/xla/client:xla_computation",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:local_service",
"//tensorflow/compiler/xla/service:platform_util",
"//tensorflow/compiler/xla/service:shaped_buffer",
@ -2090,6 +2089,7 @@ xla_test(
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/core:test",
"//tensorflow/stream_executor:device_memory_allocator",
],
)

@ -2206,13 +2206,13 @@ xla_test(
"//tensorflow/compiler/xla:statusor",
"//tensorflow/compiler/xla:types",
"//tensorflow/compiler/xla:xla_data_proto",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:generic_transfer_manager",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/compiler/xla/service:stream_pool",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/core:test",
"//tensorflow/stream_executor:device_memory_allocator",
],
)
@ -1521,7 +1521,7 @@ void DOT_ReorderContracting(int num_iters) {

se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
StreamExecutorMemoryAllocator allocator(platform, executors);
se::StreamExecutorMemoryAllocator allocator(platform, executors);

xla::LocalClientOptions client_options;
client_options.set_platform(platform);

@ -21,7 +21,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/reference_util.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
@ -34,6 +33,7 @@ limitations under the License.
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace {
@ -736,7 +736,7 @@ void BM_DynamicSlice(int num_iters) {

se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
StreamExecutorMemoryAllocator allocator(platform, executors);
se::StreamExecutorMemoryAllocator allocator(platform, executors);
LocalClient* client =
ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
auto* transfer_manager =

@ -829,7 +829,7 @@ void BM_ParallelFusion(int num_iters) {

se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
StreamExecutorMemoryAllocator allocator(platform, executors);
se::StreamExecutorMemoryAllocator allocator(platform, executors);

const int64 intra_op_parallelism_threads = 24;
xla::LocalClientOptions client_options;
@ -22,7 +22,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/xla_builder.h"
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
@ -41,6 +40,7 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace {
@ -902,7 +902,7 @@ void BM_LocalClientOverhead(int num_iters) {

se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
StreamExecutorMemoryAllocator allocator(platform, executors);
se::StreamExecutorMemoryAllocator allocator(platform, executors);
LocalClient* client =
ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
auto* transfer_manager =
@ -35,17 +35,16 @@ namespace xla {

/* static */ TestAllocator* LocalClientTestBase::allocator_;

StatusOr<OwningDeviceMemory> TestAllocator::Allocate(int device_ordinal,
uint64 size,
bool retry_on_failure) {
StatusOr<se::OwningDeviceMemory> TestAllocator::Allocate(
int device_ordinal, uint64 size, bool retry_on_failure) {
VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")";
{
tensorflow::mutex_lock lock(count_mutex_);
allocation_count_++;
device_allocation_count_[device_ordinal]++;
}
return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
retry_on_failure);
return se::StreamExecutorMemoryAllocator::Allocate(device_ordinal, size,
retry_on_failure);
}

Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
@ -55,7 +54,7 @@ Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
deallocation_count_++;
device_deallocation_count_[device_ordinal]++;
}
return StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem);
return se::StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem);
}

int64 TestAllocator::allocation_count() const {
@ -24,7 +24,6 @@ limitations under the License.
#include "tensorflow/compiler/xla/client/client_library.h"
#include "tensorflow/compiler/xla/client/local_client.h"
#include "tensorflow/compiler/xla/client/xla_computation.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/local_service.h"
#include "tensorflow/compiler/xla/service/platform_util.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
@ -36,18 +35,19 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {

class TestAllocator : public StreamExecutorMemoryAllocator {
class TestAllocator : public se::StreamExecutorMemoryAllocator {
public:
explicit TestAllocator(se::Platform* platform)
: StreamExecutorMemoryAllocator(
: se::StreamExecutorMemoryAllocator(
platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) {
}

StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) override;
StatusOr<se::OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) override;
Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;

// Return the number of allocations that have been performed.
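Aside: TestAllocator decorates the real allocator with per-device call counters. A sketch of the intended use (hypothetical snippet; `platform` is assumed in scope):

TestAllocator allocator(platform);
TF_ASSERT_OK_AND_ASSIGN(
    se::OwningDeviceMemory mem,
    allocator.Allocate(/*device_ordinal=*/0, /*size=*/64));
// The override bumps its counters, then forwards to
// se::StreamExecutorMemoryAllocator::Allocate.
EXPECT_EQ(1, allocator.allocation_count());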
@ -19,7 +19,6 @@ limitations under the License.

#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/generic_transfer_manager.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/stream_pool.h"
@ -34,6 +33,7 @@ limitations under the License.
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace {

@ -1265,7 +1265,7 @@ void BM_WhileLoop(int num_iters) {

se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie();
auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie();
StreamExecutorMemoryAllocator allocator(platform, executors);
se::StreamExecutorMemoryAllocator allocator(platform, executors);
LocalClient* client =
ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie();
@ -135,7 +135,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
LocalService* service = ClientLibrary::GetXlaService(client->platform());
Backend* backend = service->mutable_backend();
se::StreamExecutor* executor = backend->default_stream_executor();
DeviceMemoryAllocator* allocator = backend->memory_allocator();
se::DeviceMemoryAllocator* allocator = backend->memory_allocator();
auto* transfer_manager = backend->transfer_manager();
TF_ASSERT_OK_AND_ASSIGN(
StreamPool::Ptr stream_ptr,

@ -271,7 +271,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
// Run the computation num_runs times, and return the result from the last
// execution.
const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile();
StreamExecutorMemoryAllocator allocator(
se::StreamExecutorMemoryAllocator allocator(
client->platform(),
{client->platform()->ExecutorForDevice(0).ValueOrDie()});
absl::optional<ScopedShapedBuffer> final_result;
@ -67,13 +67,13 @@ cc_library(
"//tensorflow/compiler/xla:xla_proto",
"//tensorflow/compiler/xla/client:local_client",
"//tensorflow/compiler/xla/service:backend",
"//tensorflow/compiler/xla/service:device_memory_allocator",
"//tensorflow/compiler/xla/service:shaped_buffer",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:lib_internal",
"//tensorflow/stream_executor",
"//tensorflow/stream_executor:device_memory_allocator",
"@com_google_absl//absl/memory",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/synchronization",
@ -117,7 +117,7 @@ Status AllocateScopedShapedBuffer(
xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
TF_ASSIGN_OR_RETURN(
xla::OwningDeviceMemory buffer,
se::OwningDeviceMemory buffer,
allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false));
// Move our buffer into shaped_buffer, which takes ownership of it.
index_to_buffer.second = buffer.Forget();
@ -135,7 +135,7 @@ Status AllocateScopedShapedBuffer(

XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
int device_ordinal,
xla::DeviceMemoryAllocator* allocator)
se::DeviceMemoryAllocator* allocator)
: size_(allocation.size()),
allocation_(allocation),
device_ordinal_(device_ordinal),
@ -169,7 +169,7 @@ void XRTBufferAllocation::DiscardAllocation() {
}

XRTTupleAllocation::XRTTupleAllocation(int device_ordinal,
xla::DeviceMemoryAllocator* allocator,
se::DeviceMemoryAllocator* allocator,
const xla::Shape& on_host_shape,
const xla::Shape& on_device_shape)
: device_ordinal_(device_ordinal),
@ -342,7 +342,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr;

/* static */ Status XRTTupleAllocation::ExpandTreeOfTuples(
const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
xla::Shape* device_shape) {
// Initialize both host and device shape to be the 'spine' of the new tuple
// shape, given by the shape of the tree of tuples.
@ -415,7 +415,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr;
xla::Shape subshape =
xla::ShapeUtil::GetSubshape(device_shape, index);
uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
allocator->Allocate(device_ordinal, size,
/*retry_on_failure=*/false));
VLOG(2) << "Allocated buffer at " << buffer.opaque() << " index "
@ -502,7 +502,7 @@ bool XRTTupleAllocation::IsExclusiveOwner() {

void XRTTupleAllocation::InitializeFromShapedBuffer(
const xla::ShapedBuffer& shaped_buffer,
xla::DeviceMemoryAllocator* allocator, int device_ordinal) {
se::DeviceMemoryAllocator* allocator, int device_ordinal) {
for (auto& buffer : buffers_) {
// Make a reference-counted version of the allocated buffer.
buffer.second = new XRTBufferAllocation(shaped_buffer.buffer(buffer.first),
@ -549,7 +549,7 @@ XRTTupleAllocation::ToDeviceMemoryTree(
if (!release_checker(buffer.first)) {
*shaped_tree.mutable_element(buffer.first) = buffer.second->allocation();
} else {
*shaped_tree.mutable_element(buffer.first) = xla::OwningDeviceMemory(
*shaped_tree.mutable_element(buffer.first) = se::OwningDeviceMemory(
buffer.second->allocation(), device_ordinal_, allocator_);
DiscardAllocation(buffer.first);
}
@ -25,7 +25,6 @@ limitations under the License.

#include "tensorflow/compiler/xla/literal.h"
#include "tensorflow/compiler/xla/service/backend.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
@ -34,6 +33,7 @@ limitations under the License.
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/stream_executor.h"

namespace tensorflow {
@ -45,8 +45,7 @@ namespace tensorflow {
class XRTBufferAllocation : public core::RefCounted {
public:
XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
int device_ordinal,
xla::DeviceMemoryAllocator* allocator);
int device_ordinal, se::DeviceMemoryAllocator* allocator);
~XRTBufferAllocation() override;

// The region of device memory being wrapped.
@ -69,7 +68,7 @@ class XRTBufferAllocation : public core::RefCounted {
uint64 size_ = 0;
se::DeviceMemoryBase allocation_;
int device_ordinal_;
xla::DeviceMemoryAllocator* allocator_;
se::DeviceMemoryAllocator* allocator_;
};

// Entry in the resource manager corresponding to an allocation handle returned
@ -197,14 +196,14 @@ class XRTTupleAllocation : public ResourceBase {

private:
// Creates a new handle with (tuple) shape.
XRTTupleAllocation(int device_ordinal, xla::DeviceMemoryAllocator* allocator,
XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator,
const xla::Shape& on_host_shape,
const xla::Shape& on_device_shape);

// Inherits the allocations represented in buffer, which must have the same
// shape as buffers_.
void InitializeFromShapedBuffer(const xla::ShapedBuffer& shaped_buffer,
xla::DeviceMemoryAllocator* allocator,
se::DeviceMemoryAllocator* allocator,
int device_ordinal);

// Takes a tree 'elements' where each leaf is an allocation, validates that
@ -214,12 +213,12 @@ class XRTTupleAllocation : public ResourceBase {
// grafted on.
static Status ExpandTreeOfTuples(
const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
xla::Shape* device_shape);

// Location of the memory that is being managed.
int device_ordinal_;
xla::DeviceMemoryAllocator* allocator_;
se::DeviceMemoryAllocator* allocator_;

// The shape that the caller thinks the tuple has.
const xla::Shape on_host_shape_;
@ -666,6 +666,27 @@ cc_library(
],
)

cc_library(
name = "device_memory_allocator",
srcs = [
"device_memory_allocator.cc",
"owning_device_memory.cc",
],
hdrs = [
"device_memory_allocator.h",
"owning_device_memory.h",
],
deps = [
":platform",
"//tensorflow/core:lib",
"//tensorflow/core:stream_executor_no_cuda",
"//tensorflow/stream_executor/lib",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/types:span",
],
)

tf_cc_test(
name = "stream_test",
size = "small",
@ -13,30 +13,31 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

#include <string>

#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/compiler/xla/util.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "tensorflow/core/lib/strings/numbers.h"

namespace xla {
namespace stream_executor {

StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
const se::Platform* platform,
absl::Span<se::StreamExecutor* const> stream_executors)
const Platform* platform,
absl::Span<StreamExecutor* const> stream_executors)
: DeviceMemoryAllocator(platform),
stream_executors_(stream_executors.begin(), stream_executors.end()) {}

StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
port::StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
int device_ordinal, uint64 size, bool retry_on_failure) {
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
port::StatusOr<StreamExecutor*> stream_executor_or =
GetStreamExecutor(device_ordinal);
TF_RETURN_IF_ERROR(stream_executor_or.status());
DeviceMemoryBase result =
stream_executor_or.ValueOrDie()->AllocateArray<uint8>(size);
if (size > 0 && result == nullptr) {
return ResourceExhausted(
return tensorflow::errors::ResourceExhausted(
"Failed to allocate request for %s (%uB) on device ordinal %d",
tensorflow::strings::HumanReadableNumBytes(size), size, device_ordinal);
}
@ -47,32 +48,34 @@ StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
return OwningDeviceMemory(result, device_ordinal, this);
}

Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
se::DeviceMemoryBase mem) {
port::Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
DeviceMemoryBase mem) {
if (!mem.is_null()) {
TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
GetStreamExecutor(device_ordinal));
port::StatusOr<StreamExecutor*> stream_executor_or =
GetStreamExecutor(device_ordinal);
TF_RETURN_IF_ERROR(stream_executor_or.status());
VLOG(3) << absl::StreamFormat("Freeing %p on device ordinal %d",
mem.opaque(), device_ordinal);
stream_executor->Deallocate(&mem);
stream_executor_or.ValueOrDie()->Deallocate(&mem);
}
return Status::OK();
return port::Status::OK();
}

StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
int device_ordinal) {
port::StatusOr<StreamExecutor*>
StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
if (device_ordinal < 0) {
return InvalidArgument("device ordinal value (%d) must be non-negative",
device_ordinal);
return tensorflow::errors::InvalidArgument(
"device ordinal value (%d) must be non-negative", device_ordinal);
}
if (device_ordinal >= stream_executors_.size()) {
return InvalidArgument(
return tensorflow::errors::InvalidArgument(
"device ordinal value (%d) >= number of devices (%u)", device_ordinal,
stream_executors_.size());
}
if (stream_executors_[device_ordinal] == nullptr) {
return NotFound("Device %s:%d present but not supported",
platform()->Name(), device_ordinal);
return tensorflow::errors::NotFound(
absl::StrFormat("Device %s:%d present but not supported",
platform()->Name(), device_ordinal));
}
return stream_executors_[device_ordinal];
}
@ -81,4 +84,4 @@ bool StreamExecutorMemoryAllocator::AllowsAsynchronousDeallocation() const {
return false;
}

} // namespace xla
} // namespace stream_executor
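Note the shape of the error handling above: now that this file lives in stream_executor rather than XLA, the hunks drop XLA's TF_ASSIGN_OR_RETURN and its InvalidArgument/ResourceExhausted/NotFound helpers (presumably to avoid depending back on XLA), checking a port::StatusOr explicitly and building errors through tensorflow::errors:: instead. The pattern in isolation, as a sketch with placeholder names (GetThing/Thing are hypothetical):

port::StatusOr<Thing*> thing_or = GetThing();
TF_RETURN_IF_ERROR(thing_or.status());  // propagate the failure to the caller
Thing* thing = thing_or.ValueOrDie();   // safe: status was checked above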
@ -19,13 +19,13 @@ limitations under the License.
#include <vector>

#include "absl/types/span.h"
#include "tensorflow/compiler/xla/service/owning_device_memory.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/owning_device_memory.h"
#include "tensorflow/stream_executor/platform.h"

namespace xla {
namespace stream_executor {

// Interface for device memory allocators used within the XLA service. An
// allocator is responsible for allocating memory on all devices of a particular
@ -34,7 +34,7 @@ class DeviceMemoryAllocator {
public:
// Parameter platform indicates which platform the allocator allocates memory
// on. Must be non-null.
explicit DeviceMemoryAllocator(const se::Platform* platform)
explicit DeviceMemoryAllocator(const Platform* platform)
: platform_(platform) {}
virtual ~DeviceMemoryAllocator() {}

@ -47,23 +47,23 @@ class DeviceMemoryAllocator {
// fails, the allocation should return immediately without retrying. An
// example use case is optional scratch spaces where a failure has only
// performance impact.
virtual StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) = 0;
virtual port::StatusOr<OwningDeviceMemory> Allocate(
int device_ordinal, uint64 size, bool retry_on_failure) = 0;

// Two-arg version of Allocate(), which sets retry-on-failure to true.
//
// (We don't simply use a default argument on the virtual Allocate function
// because default args on virtual functions are disallowed by the Google
// style guide.)
StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
}

// Must be a nop for null pointers.
virtual Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) = 0;
virtual port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) = 0;

// Return the platform that the allocator allocates memory on.
const se::Platform* platform() const { return platform_; }
const Platform* platform() const { return platform_; }

// Can we call Deallocate() as soon as a computation has been scheduled on
// a stream, or do we have to wait for the computation to complete first?
@ -71,7 +71,7 @@ class DeviceMemoryAllocator {

protected:
friend class OwningDeviceMemory;
const se::Platform* platform_;
const Platform* platform_;
};

// Default memory allocator for a platform which uses
@ -79,28 +79,28 @@ class DeviceMemoryAllocator {
class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
public:
StreamExecutorMemoryAllocator(
const se::Platform* platform,
absl::Span<se::StreamExecutor* const> stream_executors);
const Platform* platform,
absl::Span<StreamExecutor* const> stream_executors);

StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) override;
port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
bool retry_on_failure) override;

// Pull in two-arg overload that sets retry_on_failure to true.
using DeviceMemoryAllocator::Allocate;

Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override;

bool AllowsAsynchronousDeallocation() const override;

private:
StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
port::StatusOr<StreamExecutor*> GetStreamExecutor(int device_ordinal);

// A vector indexed by device ordinal of StreamExecutors for each device of
// the allocator's platform type. If an element is nullptr, then the device
// with the respective device ordinal is not supported by XLA.
std::vector<se::StreamExecutor*> stream_executors_;
std::vector<StreamExecutor*> stream_executors_;
};

} // namespace xla
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_DEVICE_MEMORY_ALLOCATOR_H_
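For illustration, a minimal custom allocator written against the interface above, in the new namespace (a host-memory-backed sketch, not part of this change; the class name is hypothetical and <cstdlib> is assumed included):

class HostMallocAllocator : public DeviceMemoryAllocator {  // hypothetical
 public:
  explicit HostMallocAllocator(const Platform* platform)
      : DeviceMemoryAllocator(platform) {}

  port::StatusOr<OwningDeviceMemory> Allocate(
      int device_ordinal, uint64 size, bool /*retry_on_failure*/) override {
    // By contract, return a null (inactive) result for size == 0.
    if (size == 0) return OwningDeviceMemory();
    return OwningDeviceMemory(DeviceMemoryBase(malloc(size), size),
                              device_ordinal, this);
  }
  using DeviceMemoryAllocator::Allocate;  // pull in the two-arg overload

  port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override {
    free(mem.opaque());  // free(nullptr) is a nop, as the contract requires
    return port::Status::OK();
  }

  bool AllowsAsynchronousDeallocation() const override { return false; }
};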
@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/owning_device_memory.h"
#include "tensorflow/stream_executor/owning_device_memory.h"

#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace xla {
namespace stream_executor {

void OwningDeviceMemory::Free() {
CHECK(allocator_ != nullptr)
@ -29,7 +29,7 @@ void OwningDeviceMemory::Free() {
}

allocator_ = nullptr;
mem_ = se::DeviceMemoryBase();
mem_ = DeviceMemoryBase();
}

} // namespace xla
} // namespace stream_executor
@ -16,12 +16,10 @@ limitations under the License.
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_

#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"

namespace xla {
namespace stream_executor {

// Break circular dependency between this file and device_memory_allocator.h.
class DeviceMemoryAllocator;
@ -43,7 +41,7 @@ class OwningDeviceMemory {
public:
OwningDeviceMemory() : device_ordinal_(-1), allocator_(nullptr) {}

explicit OwningDeviceMemory(se::DeviceMemoryBase mem, int device_ordinal,
explicit OwningDeviceMemory(DeviceMemoryBase mem, int device_ordinal,
DeviceMemoryAllocator* allocator)
: mem_(mem), device_ordinal_(device_ordinal), allocator_(allocator) {
CHECK(allocator != nullptr) << "allocator cannot be null.";
@ -53,7 +51,7 @@ class OwningDeviceMemory {
: mem_(other.mem_),
device_ordinal_(other.device_ordinal_),
allocator_(other.allocator_) {
other.mem_ = se::DeviceMemoryBase();
other.mem_ = DeviceMemoryBase();
other.allocator_ = nullptr;
}

@ -65,7 +63,7 @@ class OwningDeviceMemory {
device_ordinal_ = other.device_ordinal_;
allocator_ = other.allocator_;

other.mem_ = se::DeviceMemoryBase();
other.mem_ = DeviceMemoryBase();
other.allocator_ = nullptr;
return *this;
}
@ -100,25 +98,25 @@ class OwningDeviceMemory {
// !is_null() is sufficient but not necessary to imply `this` is active.
bool is_null() const { return mem_.is_null(); }

se::DeviceMemoryBase AsDeviceMemoryBase() const {
DeviceMemoryBase AsDeviceMemoryBase() const {
// This const_cast is necessary because DeviceMemoryBase's constructor
// doesn't accept a const void*. This isn't ideal, but it's better than the
// alternative of making a AsDeviceMemoryBase non-const member function.
//
// This is safe (i.e. not UB) because the casted pointer is derived from a
// non-const pointer, namely mem_.opaque().
return se::DeviceMemoryBase(const_cast<void*>(opaque()), size());
return DeviceMemoryBase(const_cast<void*>(opaque()), size());
}

// Returns the wrapped DeviceMemoryBase without freeing it, and deactivates
// this object. Precondition: `this` is active.
TF_MUST_USE_RESULT se::DeviceMemoryBase Forget() {
TF_MUST_USE_RESULT DeviceMemoryBase Forget() {
CHECK(allocator_ != nullptr)
<< "Can't call Forget() on an inactive (i.e. moved from, Forget()'ten, "
"or Free()'ed) instance.";
allocator_ = nullptr;
se::DeviceMemoryBase mem(mem_);
mem_ = se::DeviceMemoryBase();
DeviceMemoryBase mem(mem_);
mem_ = DeviceMemoryBase();
return mem;
}

@ -127,11 +125,11 @@ class OwningDeviceMemory {
void Free();

private:
se::DeviceMemoryBase mem_;
DeviceMemoryBase mem_;
int device_ordinal_;
DeviceMemoryAllocator* allocator_; // Null if this object is inactive.
};

} // namespace xla
} // namespace stream_executor

#endif // TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
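Taken together, the class above gives OwningDeviceMemory move-only, RAII-style semantics. A usage sketch (hypothetical snippet; `allocator` is assumed to be a DeviceMemoryAllocator for device 0, and <utility> is assumed included):

TF_ASSIGN_OR_RETURN(OwningDeviceMemory owned,
                    allocator.Allocate(/*device_ordinal=*/0, /*size=*/128));
OwningDeviceMemory moved = std::move(owned);  // `owned` becomes inactive
DeviceMemoryBase raw = moved.Forget();        // release without freeing
// `raw` is now unowned; it must eventually be returned via
// allocator.Deallocate(/*device_ordinal=*/0, raw), or wrapped in a new
// OwningDeviceMemory to re-establish ownership.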