diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index ef91c85ec36..cd3ce757110 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -262,7 +262,6 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
@@ -270,6 +269,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/stream_executor:device_memory_allocator",
        "@com_google_absl//absl/algorithm:container",
        "@com_google_absl//absl/base:core_headers",
        "@com_google_absl//absl/memory",
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index 88d00f7f8e1..6df0991e354 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -62,7 +62,7 @@ XlaPlatformInfo PlatformInfoFromContext(OpKernelConstruction* ctx) {
   se::Platform::Id platform_id = nullptr;
   const XlaDevice::Metadata* xla_device_metadata = nullptr;
   std::unique_ptr<XlaAllocator> xla_allocator;
-  xla::DeviceMemoryAllocator* device_allocator = nullptr;
+  se::DeviceMemoryAllocator* device_allocator = nullptr;
 
   if (ctx->device_type() == DeviceType(DEVICE_CPU)) {
     platform_id = se::host::kHostPlatformId;
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.h b/tensorflow/compiler/jit/kernels/xla_ops.h
index 7b4d4b5b473..eaa686780e4 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.h
+++ b/tensorflow/compiler/jit/kernels/xla_ops.h
@@ -40,7 +40,7 @@ class XlaPlatformInfo {
                   se::Platform::Id platform_id,
                   const XlaDevice::Metadata* xla_device_metadata,
                   std::unique_ptr<XlaAllocator> xla_allocator,
-                  xla::DeviceMemoryAllocator* device_allocator)
+                  se::DeviceMemoryAllocator* device_allocator)
       : device_type_(device_type),
         platform_id_(platform_id),
         xla_device_metadata_(xla_device_metadata),
@@ -55,7 +55,7 @@ class XlaPlatformInfo {
     return xla_device_metadata_ && xla_device_metadata_->UseMultipleStreams();
   }
 
-  xla::DeviceMemoryAllocator* allocator() const {
+  se::DeviceMemoryAllocator* allocator() const {
     return device_allocator_ ? device_allocator_ : xla_allocator_.get();
   }
   DeviceType device_type() const { return device_type_; }
@@ -86,7 +86,7 @@ class XlaPlatformInfo {
   // then device_allocator_ is null and xla_allocator_ points to an appropriate
   // XlaAllocator instance.
   std::unique_ptr<XlaAllocator> xla_allocator_;
-  xla::DeviceMemoryAllocator* device_allocator_;
+  se::DeviceMemoryAllocator* device_allocator_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(XlaPlatformInfo);
 };
 
diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 777763342a6..3bb698b33d6 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -168,11 +168,11 @@ Status SnapshotResourceVariables(OpKernelContext* ctx,
 }
 
 XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped)
-    : xla::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
+    : se::DeviceMemoryAllocator(platform), wrapped_(wrapped) {}
 
 XlaAllocator::~XlaAllocator() {}
 
-xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
+xla::StatusOr<se::OwningDeviceMemory> XlaAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
   AllocationAttributes attrs;
   attrs.no_retry_on_failure = !retry_on_failure;
@@ -184,8 +184,8 @@ xla::StatusOr<xla::OwningDeviceMemory> XlaAllocator::Allocate(
           "Out of memory while trying to allocate ", size, " bytes.");
     }
   }
-  return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
-                                 device_ordinal, this);
+  return se::OwningDeviceMemory(se::DeviceMemoryBase(data, size),
+                                device_ordinal, this);
 }
 
 Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
@@ -194,7 +194,7 @@ Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) {
 }
 
 XlaComputationLaunchContext::XlaComputationLaunchContext(
-    xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator,
+    xla::LocalClient* client, se::DeviceMemoryAllocator* xla_allocator,
     bool allocate_xla_tensors, bool use_multiple_streams)
     : client_(client),
       xla_allocator_(xla_allocator),
@@ -374,7 +374,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
     } else {
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           ctx->expected_output_dtype(i), shape, buffer, allocator);
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+      output.set_buffer(se::OwningDeviceMemory(), {output_num});
       ctx->set_output(i, output_tensor);
     }
     ++output_num;
@@ -435,7 +435,7 @@ Status XlaComputationLaunchContext::PopulateOutputs(
       *variable_infos[i].var()->tensor() = output_tensor;
     } else {
       se::DeviceMemoryBase buffer = output.buffer({output_num});
-      output.set_buffer(xla::OwningDeviceMemory(), {output_num});
+      output.set_buffer(se::OwningDeviceMemory(), {output_num});
       Tensor output_tensor = XlaTensorBuffer::MakeTensor(
           write.type, write.shape, buffer, allocator);
       *variable_infos[i].var()->tensor() = output_tensor;
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index c915b7118d0..c6a9b931401 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -23,14 +23,14 @@ limitations under the License.
#include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/resource_var.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace tensorflow { class XlaAllocator; @@ -108,11 +108,11 @@ Status LockVariables(absl::Span variables) // Adapter class that wraps a Tensorflow allocator as an XLA allocator. // Assumes that the Tensorflow allocator permits asynchronous deallocation: // see comment on `AllowsAsynchronousDeallocation()`. -class XlaAllocator : public xla::DeviceMemoryAllocator { +class XlaAllocator : public se::DeviceMemoryAllocator { public: XlaAllocator(const se::Platform* platform, Allocator* wrapped); ~XlaAllocator() override; - xla::StatusOr Allocate( + xla::StatusOr Allocate( int device_ordinal, uint64 size, bool retry_on_failure) override; Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; @@ -142,7 +142,7 @@ class XlaComputationLaunchContext { // because we track inter-stream dependencies through events inside XlaTensor // objects. XlaComputationLaunchContext(xla::LocalClient* client, - xla::DeviceMemoryAllocator* xla_allocator, + se::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors, bool use_multiple_streams); @@ -186,7 +186,7 @@ class XlaComputationLaunchContext { private: xla::LocalClient* client_; - xla::DeviceMemoryAllocator* xla_allocator_; + se::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; bool use_multiple_streams_; std::vector> arg_buffers_; diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index b92bd675378..1c1080f2385 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -59,7 +59,7 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first); uint64 size = client->backend().transfer_manager()->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer, + TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer, client->backend().memory_allocator()->Allocate( device_ordinal, size, /*retry_on_failure=*/false)); // Move our buffer into shaped_buffer, which takes ownership of it. diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 406d5ba197b..1cc5d8d4728 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -339,7 +339,7 @@ class XlaCompiler { // here, but on some devices (notably, GPUs), TensorFlow tends to eagerly // allocate most or all available memory on the device, leaving none for the // compiler to access, unless it can use TensorFlow's allocator. 
-    xla::DeviceMemoryAllocator* device_allocator = nullptr;
+    se::DeviceMemoryAllocator* device_allocator = nullptr;
   };
 
   explicit XlaCompiler(Options options);
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index d5ade8f6262..b800229bd90 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -96,7 +96,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
        "@com_google_absl//absl/types:optional",
@@ -117,7 +117,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:dump",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo_proto",
@@ -126,6 +125,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:source_map_util",
         "//tensorflow/compiler/xla/service:stream_pool",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:span",
        "@llvm//:support",
@@ -165,11 +165,11 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compile_only_service",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor:device_memory_allocator",
        "@com_google_absl//absl/memory",
        "@com_google_absl//absl/types:optional",
     ],
diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h
index 62d225c6c29..33d1de370de 100644
--- a/tensorflow/compiler/xla/client/client_library.h
+++ b/tensorflow/compiler/xla/client/client_library.h
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/compile_only_client.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/compiler/xla/service/compile_only_service.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -39,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index f2d124d099b..d5de53a7941 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -22,12 +22,12 @@ limitations under the License.
 namespace xla {
 
 ExecutableBuildOptions& ExecutableBuildOptions::set_device_allocator(
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   device_allocator_ = allocator;
   return *this;
 }
 
-DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
+se::DeviceMemoryAllocator* ExecutableBuildOptions::device_allocator() const {
   return device_allocator_;
 }
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 1d85fb34304..e2e231981bf 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -18,11 +18,11 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 
@@ -57,11 +57,11 @@ class ExecutableBuildOptions {
   // want to run various algorithms on the device and pick the fastest one -- it
   // might allocate buffers for use by these algorithms using this allocator.
   //
-  // This does not need to be the same as the DeviceMemoryAllocator passed when
-  // running the executable.
+  // This does not need to be the same as the se::DeviceMemoryAllocator passed
+  // when running the executable.
   ExecutableBuildOptions& set_device_allocator(
-      DeviceMemoryAllocator* allocator);
-  DeviceMemoryAllocator* device_allocator() const;
+      se::DeviceMemoryAllocator* allocator);
+  se::DeviceMemoryAllocator* device_allocator() const;
 
   // Returns a string representation of the build options, suitable for
   // debugging.
@@ -77,7 +77,7 @@ class ExecutableBuildOptions {
   Shape result_layout_;
   bool result_layout_set_ = false;
   absl::optional<DebugOptions> debug_options_;
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
+  se::DeviceMemoryAllocator* device_allocator_ = nullptr;
   int num_replicas_ = 1;
 };
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index 192785646ec..1bd9d7b7228 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -279,7 +279,7 @@ StatusOr<std::unique_ptr<LocalExecutable>> LocalClient::Compile(
 
 StatusOr<ScopedShapedBuffer> LocalClient::LiteralToShapedBuffer(
     const LiteralSlice& literal, int device_ordinal,
-    DeviceMemoryAllocator* allocator) {
+    se::DeviceMemoryAllocator* allocator) {
   if (allocator == nullptr) {
     allocator = backend().memory_allocator();
   }
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 7f4a3db10b8..1e7c97d6f06 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/executable_run_options.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
@@ -32,6 +31,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -137,7 +137,7 @@ class LocalClient : public Client { // device is used. StatusOr LiteralToShapedBuffer( const LiteralSlice& literal, int device_ordinal, - DeviceMemoryAllocator* allocator = nullptr); + se::DeviceMemoryAllocator* allocator = nullptr); // Transfer the BorrowingLiteral to the device with the given ordinal. StatusOr TransferToLocalServer( diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index 230f3b202a4..39c90b60a09 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -26,12 +26,13 @@ ExecutableRunOptions& ExecutableRunOptions::set_device_ordinal( int ExecutableRunOptions::device_ordinal() const { return device_ordinal_; } ExecutableRunOptions& ExecutableRunOptions::set_allocator( - DeviceMemoryAllocator* allocator) { + stream_executor::DeviceMemoryAllocator* allocator) { allocator_ = allocator; return *this; } -DeviceMemoryAllocator* ExecutableRunOptions::allocator() const { +stream_executor::DeviceMemoryAllocator* ExecutableRunOptions::allocator() + const { return allocator_; } diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index 1ac26a0fb40..84629593953 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -23,6 +23,7 @@ limitations under the License. namespace stream_executor { class Stream; class Platform; +class DeviceMemoryAllocator; } // namespace stream_executor namespace Eigen { @@ -31,7 +32,6 @@ struct ThreadPoolDevice; namespace xla { -class DeviceMemoryAllocator; class DeviceAssignment; class ExecutionProfile; @@ -39,8 +39,9 @@ class ExecutionProfile; class ExecutableRunOptions { public: // Specifies the allocator to use during execution. - ExecutableRunOptions& set_allocator(DeviceMemoryAllocator* allocator); - DeviceMemoryAllocator* allocator() const; + ExecutableRunOptions& set_allocator( + stream_executor::DeviceMemoryAllocator* allocator); + stream_executor::DeviceMemoryAllocator* allocator() const; // If set, this is the device to run the computation on. Valid device_ordinal // values are: 0 to # of devices - 1. 
   // values are: 0 to # of devices - 1. These values are identical to the device
@@ -87,7 +88,7 @@ class ExecutableRunOptions {
   int rng_seed() const;
 
  private:
-  DeviceMemoryAllocator* allocator_ = nullptr;
+  stream_executor::DeviceMemoryAllocator* allocator_ = nullptr;
   int device_ordinal_ = -1;
   const DeviceAssignment* device_assignment_ = nullptr;
   stream_executor::Stream* stream_ = nullptr;
diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 339f8f004fa..ebb489c380d 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -67,8 +67,8 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
+        "//tensorflow/stream_executor:device_memory_allocator",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/types:optional",
        "@pybind11",
@@ -109,9 +109,9 @@ cc_library(
     hdrs = ["shared_device_buffer.h"],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:transfer_manager",
+        "//tensorflow/stream_executor:device_memory_allocator",
        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
@@ -178,7 +178,7 @@ tf_pybind_extension(
         "//tensorflow/compiler/xla/client/lib:self_adjoint_eig",
         "//tensorflow/compiler/xla/client/lib:svd",
         "//tensorflow/compiler/xla/service:computation_placer",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_graph_dumper",
         "//tensorflow/compiler/xla/service:name_uniquer",
diff --git a/tensorflow/compiler/xla/python/local_client.cc b/tensorflow/compiler/xla/python/local_client.cc
index fe5142f40a1..3b35efa6439 100644
--- a/tensorflow/compiler/xla/python/local_client.cc
+++ b/tensorflow/compiler/xla/python/local_client.cc
@@ -212,7 +212,7 @@ StatusOr<Literal> PyLocalClient::TransferFromOutfeed(
 static StatusOr<PyLocalBuffer> TransferHostToDeviceAsync(
     const PythonBufferTree& tree, int device_ordinal, PyLocalClient* client,
     const Device& device) {
-  DeviceMemoryAllocator* allocator =
+  se::DeviceMemoryAllocator* allocator =
       client->client()->backend().memory_allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
@@ -367,7 +367,7 @@ PyLocalBuffer::FromPythonValues(
     host_shapes.push_back(buffer.on_host_shape());
     device_buffers.push_back(buffer.device_buffer());
   }
-  DeviceMemoryAllocator* allocator =
+  se::DeviceMemoryAllocator* allocator =
       client->client()->backend().memory_allocator();
   TransferManager* transfer_manager =
       client->client()->backend().transfer_manager();
diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.cc b/tensorflow/compiler/xla/python/shared_device_buffer.cc
index 6ff733c1ed5..f9fbd9eb933 100644
--- a/tensorflow/compiler/xla/python/shared_device_buffer.cc
+++ b/tensorflow/compiler/xla/python/shared_device_buffer.cc
@@ -15,7 +15,7 @@ limitations under the License.
#include "tensorflow/compiler/xla/python/shared_device_buffer.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -47,14 +47,14 @@ void BufferDefinitionEvent::WaitForEventOnStream(se::Stream* stream) { static std::shared_ptr BufferFromScopedShapedBufferIterator( const Shape& on_device_shape, int device_ordinal, - DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, ShapeTree::iterator* iterator, const ShapeTree::iterator& end, const std::shared_ptr& definition_event) { CHECK(*iterator != end); - OwningDeviceMemory device_memory((*iterator)->second, device_ordinal, - allocator); + se::OwningDeviceMemory device_memory((*iterator)->second, device_ordinal, + allocator); (*iterator)->second = se::DeviceMemoryBase(); ++*iterator; @@ -90,7 +90,7 @@ PySharedDeviceBuffer::FromScopedShapedBuffer( /* static */ StatusOr> PySharedDeviceBuffer::MakeTuple( std::vector> children, - TransferManager* transfer_manager, DeviceMemoryAllocator* allocator, + TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator, int device_ordinal, std::shared_ptr definition_event) { std::vector child_shapes; @@ -102,7 +102,7 @@ PySharedDeviceBuffer::MakeTuple( Shape shape = ShapeUtil::MakeTupleShape(child_shapes); TF_ASSIGN_OR_RETURN( - OwningDeviceMemory device_memory, + se::OwningDeviceMemory device_memory, allocator->Allocate(device_ordinal, transfer_manager->GetByteSizeRequirement(shape))); return std::make_shared( @@ -113,10 +113,10 @@ PySharedDeviceBuffer::MakeTuple( /* static */ StatusOr> PySharedDeviceBuffer::MakeArray( Shape on_device_shape, TransferManager* transfer_manager, - DeviceMemoryAllocator* allocator, int device_ordinal, + se::DeviceMemoryAllocator* allocator, int device_ordinal, std::shared_ptr definition_event) { TF_ASSIGN_OR_RETURN( - OwningDeviceMemory device_memory, + se::OwningDeviceMemory device_memory, allocator->Allocate( device_ordinal, transfer_manager->GetByteSizeRequirement(on_device_shape))); @@ -153,7 +153,7 @@ ShapedBuffer PySharedDeviceBuffer::AsShapedBuffer( } PySharedDeviceBuffer::PySharedDeviceBuffer( - Shape on_device_shape, OwningDeviceMemory device_memory, + Shape on_device_shape, se::OwningDeviceMemory device_memory, std::vector> children, std::shared_ptr definition_event) : on_device_shape_(std::move(on_device_shape)), diff --git a/tensorflow/compiler/xla/python/shared_device_buffer.h b/tensorflow/compiler/xla/python/shared_device_buffer.h index 705b3a0cfe4..6a57d7fd6a5 100644 --- a/tensorflow/compiler/xla/python/shared_device_buffer.h +++ b/tensorflow/compiler/xla/python/shared_device_buffer.h @@ -17,11 +17,11 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_PYTHON_SHARED_DEVICE_BUFFER_H_ #include "absl/container/flat_hash_set.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { @@ -93,14 +93,14 @@ class PySharedDeviceBuffer { // Makes a tuple buffer. Does not initialize the tuple table. 
   static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeTuple(
       std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
-      TransferManager* transfer_manager, DeviceMemoryAllocator* allocator,
+      TransferManager* transfer_manager, se::DeviceMemoryAllocator* allocator,
       int device_ordinal,
       std::shared_ptr<BufferDefinitionEvent> definition_event);
 
   // Makes an uninitialized array buffer.
   static StatusOr<std::shared_ptr<PySharedDeviceBuffer>> MakeArray(
       Shape on_device_shape, TransferManager* transfer_manager,
-      DeviceMemoryAllocator* allocator, int device_ordinal,
+      se::DeviceMemoryAllocator* allocator, int device_ordinal,
       std::shared_ptr<BufferDefinitionEvent> definition_event);
 
   // Builds a ShapedBuffer view onto the buffers of 'tree'. Since
@@ -113,7 +113,7 @@ class PySharedDeviceBuffer {
   const std::vector<std::shared_ptr<PySharedDeviceBuffer>>& children() const {
     return children_;
   }
-  const OwningDeviceMemory& device_memory() const { return device_memory_; }
+  const se::OwningDeviceMemory& device_memory() const { return device_memory_; }
   int device_ordinal() const { return device_memory_.device_ordinal(); }
   const std::shared_ptr<BufferDefinitionEvent> definition_event() const {
     return definition_event_;
@@ -121,7 +121,7 @@ class PySharedDeviceBuffer {
   PySharedDeviceBuffer() = default;
   PySharedDeviceBuffer(
-      Shape on_device_shape, OwningDeviceMemory device_memory,
+      Shape on_device_shape, se::OwningDeviceMemory device_memory,
       std::vector<std::shared_ptr<PySharedDeviceBuffer>> children,
       std::shared_ptr<BufferDefinitionEvent> definition_event);
 
@@ -130,7 +130,7 @@ class PySharedDeviceBuffer {
   // one-to-one with the tree of device buffers, so to avoid representational
   // awkwardness we maintain on-host shapes separately.
   Shape on_device_shape_;
-  OwningDeviceMemory device_memory_;
+  se::OwningDeviceMemory device_memory_;
   std::vector<std::shared_ptr<PySharedDeviceBuffer>> children_;
 
   // An event that is triggered when the content of one or more buffers is
diff --git a/tensorflow/compiler/xla/python/types.cc b/tensorflow/compiler/xla/python/types.cc
index 2d0eb8af855..da842318f3e 100644
--- a/tensorflow/compiler/xla/python/types.cc
+++ b/tensorflow/compiler/xla/python/types.cc
@@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/compiler/xla/python/types.h" #include "absl/container/flat_hash_map.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index e4abf742888..fbdc9cf9a9e 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -437,10 +437,10 @@ tf_cc_test( srcs = ["pattern_matcher_test.cc"], deps = [ ":hlo", + ":hlo_parser", ":pattern_matcher", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", "@com_google_absl//absl/strings", @@ -508,8 +508,8 @@ cc_library( hdrs = ["hlo_matchers.h"], deps = [ ":hlo", + ":hlo_parser", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:optional", @@ -552,13 +552,13 @@ tf_cc_test( srcs = ["hlo_sharding_test.cc"], deps = [ ":hlo", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -586,6 +586,7 @@ tf_cc_test( srcs = ["call_graph_test.cc"], deps = [ ":call_graph", + ":hlo", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -593,7 +594,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -656,6 +656,7 @@ tf_cc_test( deps = [ ":call_graph", ":flatten_call_graph", + ":hlo", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -663,7 +664,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -694,7 +694,6 @@ cc_library( deps = [ ":compiler", ":computation_placer", - ":device_memory_allocator", ":platform_util", ":stream_pool", ":transfer_manager", @@ -704,6 +703,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "//third_party/eigen3", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", @@ -724,7 +724,6 @@ cc_library( ":compiler", ":computation_layout", ":computation_placer", - ":device_memory_allocator", ":dump", ":dynamic_dimension_inference", ":executable", @@ -754,6 +753,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:ptr_util", "//tensorflow/core:stream_executor_no_cuda", + 
"//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -770,7 +770,6 @@ cc_library( ":backend", ":compiler", ":computation_layout", - ":device_memory_allocator", ":executable", ":hlo", ":hlo_execution_profile", @@ -790,6 +789,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -858,7 +858,6 @@ cc_library( srcs = ["shaped_buffer.cc"], hdrs = ["shaped_buffer.h"], deps = [ - ":device_memory_allocator", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -868,6 +867,7 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -881,7 +881,6 @@ tf_cc_test( srcs = ["shaped_buffer_test.cc"], deps = [ ":cpu_plugin", - ":device_memory_allocator", ":platform_util", ":shaped_buffer", "//tensorflow/compiler/xla:shape_util", @@ -891,6 +890,7 @@ tf_cc_test( "//tensorflow/core:ptr_util", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", ], ) @@ -904,7 +904,6 @@ cc_library( ], deps = [ ":computation_layout", - ":device_memory_allocator", ":dump", ":hlo", ":hlo_execution_profile", @@ -925,6 +924,7 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:span", @@ -991,7 +991,6 @@ cc_library( hdrs = ["allocation_tracker.h"], deps = [ ":backend", - ":device_memory_allocator", ":transfer_manager", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -1000,6 +999,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", @@ -1159,6 +1159,7 @@ tf_cc_test( ":hlo", ":hlo_memory_scheduler", ":hlo_ordering", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", @@ -1166,7 +1167,6 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -1208,10 +1208,10 @@ tf_cc_test( ":hlo_dataflow_analysis", ":hlo_memory_scheduler", ":hlo_ordering", + ":hlo_parser", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", 
"//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -1458,8 +1458,8 @@ tf_cc_test( srcs = ["instruction_fusion_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", ":instruction_fusion", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -1470,11 +1470,11 @@ cc_library( srcs = ["multi_output_fusion.cc"], hdrs = ["multi_output_fusion.h"], deps = [ + ":hlo", + ":hlo_pass", ":hlo_reachability", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", @@ -1791,8 +1791,8 @@ tf_cc_test( srcs = ["gather_expander_test.cc"], deps = [ ":gather_expander", + ":hlo_parser", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:test_macros_header", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep ], @@ -1890,9 +1890,9 @@ tf_cc_test( name = "while_loop_analysis_test", srcs = ["while_loop_analysis_test.cc"], deps = [ + ":hlo_parser", ":while_loop_analysis", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:test", @@ -2297,7 +2297,7 @@ tf_cc_test( ":cpu_plugin", ":hlo_cost_analysis", ":hlo_execution_profile", - "//tensorflow/compiler/xla/service:hlo_parser", + ":hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -2310,14 +2310,14 @@ tf_cc_test( srcs = ["hlo_computation_test.cc"], deps = [ ":hlo", + ":hlo_matchers", + ":hlo_parser", ":pattern_matcher", ":pattern_matcher_gmock", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla/service:hlo_matchers", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/container:flat_hash_map", @@ -2522,13 +2522,13 @@ tf_cc_test( deps = [ ":hlo", ":hlo_liveness_analysis", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -2912,12 +2912,12 @@ tf_cc_test( deps = [ ":hlo", ":hlo_module_dce", + ":hlo_parser", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -3043,12 +3043,12 @@ tf_cc_test( ":hlo", ":hlo_cse", ":hlo_matchers", + ":hlo_parser", "//tensorflow/compiler/xla:literal", 
"//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -3232,27 +3232,6 @@ tf_cc_test( ], ) -cc_library( - name = "device_memory_allocator", - srcs = [ - "device_memory_allocator.cc", - "owning_device_memory.cc", - ], - hdrs = [ - "device_memory_allocator.h", - "owning_device_memory.h", - ], - deps = [ - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", - "@com_google_absl//absl/types:span", - ], -) - cc_library( name = "maybe_owning_device_memory", srcs = [ @@ -3262,7 +3241,7 @@ cc_library( "maybe_owning_device_memory.h", ], deps = [ - ":device_memory_allocator", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:variant", ], @@ -3305,10 +3284,10 @@ xla_test( "gpu", ], deps = [ + ":hlo_parser", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -3431,6 +3410,7 @@ tf_cc_test( deps = [ ":hlo", ":hlo_matchers", + ":hlo_parser", ":shape_inference", ":transpose_folding", "//tensorflow/compiler/xla:literal", @@ -3439,7 +3419,6 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:xla_builder", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -3682,10 +3661,10 @@ tf_cc_test( name = "tuple_util_test", srcs = ["tuple_util_test.cc"], deps = [ + ":hlo_matchers", + ":hlo_parser", ":tuple_util", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_matchers", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], ) @@ -3711,11 +3690,11 @@ tf_cc_test( name = "while_util_test", srcs = ["while_util_test.cc"], deps = [ + ":hlo_matchers", + ":hlo_parser", ":while_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:hlo_matchers", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "@com_google_absl//absl/algorithm:container", ], @@ -3746,9 +3725,9 @@ tf_cc_test( srcs = ["while_loop_invariant_code_motion_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", ":while_loop_invariant_code_motion", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", ], @@ -3774,9 +3753,9 @@ tf_cc_test( srcs = ["while_loop_constant_sinking_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", ":while_loop_constant_sinking", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/service:hlo_parser", 
"//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/core:test", ], @@ -3976,6 +3955,8 @@ cc_library( hdrs = ["ar_crs_combiner.h"], deps = [ ":call_graph", + ":hlo", + ":hlo_pass", ":pattern_matcher", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:literal_util", @@ -3983,8 +3964,6 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_pass", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], @@ -4008,11 +3987,11 @@ cc_library( srcs = ["dynamic_index_splitter.cc"], hdrs = ["dynamic_index_splitter.h"], deps = [ + ":hlo", ":hlo_casting_utils", + ":hlo_pass", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/service:hlo_pass", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/container:inlined_vector", diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index 6cb0e985e57..ea56c75b2f2 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -20,13 +20,13 @@ limitations under the License. #include "absl/memory/memory.h" #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -221,8 +221,8 @@ void AllocationTracker::AddAllocationOrIncrementRefCount( auto it = allocation_map.find(device_memory.opaque()); if (it == allocation_map.end()) { allocation_map[device_memory.opaque()] = { - OwningDeviceMemory(device_memory, device_ordinal, - backend_->memory_allocator()), + se::OwningDeviceMemory(device_memory, device_ordinal, + backend_->memory_allocator()), /*ref_count=*/1}; } else { it->second.ref_count++; diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 98d1a302a9f..6e7f9fdfc13 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -77,7 +77,7 @@ class AllocationTracker { // Data structure encapsulating single memory allocation on the device. struct Allocation { // The pointer to this allocation. - OwningDeviceMemory device_memory; + se::OwningDeviceMemory device_memory; // This is the number of times this memory allocation is referred to by // registered data handles. diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index 1528ec61354..d859f647ea0 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -134,7 +134,7 @@ Backend::Backend(se::Platform* platform, Compiler* compiler, } } // Create a memory allocator for the valid stream executors. 
-  memory_allocator_ = absl::make_unique<StreamExecutorMemoryAllocator>(
+  memory_allocator_ = absl::make_unique<se::StreamExecutorMemoryAllocator>(
       platform, stream_executors);
   CHECK(!stream_executors_.empty())
       << "Service found no devices for backend " << platform_->Name() << '.';
diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h
index e7f29a044b9..79fdeb2b0bc 100644
--- a/tensorflow/compiler/xla/service/backend.h
+++ b/tensorflow/compiler/xla/service/backend.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/computation_placer.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -35,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace Eigen {
 struct ThreadPoolDevice;
@@ -88,7 +88,7 @@ class Backend {
   // Accessors for the various objects.
   se::Platform* platform() const { return platform_; }
   Compiler* compiler() const { return compiler_; }
-  DeviceMemoryAllocator* memory_allocator() const {
+  se::DeviceMemoryAllocator* memory_allocator() const {
     return memory_allocator_.get();
   }
   TransferManager* transfer_manager() const { return transfer_manager_; }
@@ -179,7 +179,7 @@ class Backend {
       stream_pools_ GUARDED_BY(mu_);
 
   // The default memory allocator to use.
-  std::unique_ptr<DeviceMemoryAllocator> memory_allocator_;
+  std::unique_ptr<se::DeviceMemoryAllocator> memory_allocator_;
 
   // For the CPU backend, an Eigen threadpool device for use by Eigen code.
   struct IntraOpThreadPool;
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index 9b483bd97e9..631a7dd7e6a 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -75,8 +75,10 @@ class AotCompilationOptions {
 
   // Optional allocator that may be used for allocating temp space on the device
   // during compilation.
-  DeviceMemoryAllocator* device_allocator() const { return device_allocator_; }
-  void set_device_allocator(DeviceMemoryAllocator* device_allocator) {
+  se::DeviceMemoryAllocator* device_allocator() const {
+    return device_allocator_;
+  }
+  void set_device_allocator(se::DeviceMemoryAllocator* device_allocator) {
     device_allocator_ = device_allocator;
   }
 
@@ -98,7 +100,7 @@ class AotCompilationOptions {
   AotCompilationOptions();
 
  private:
-  DeviceMemoryAllocator* device_allocator_ = nullptr;
+  se::DeviceMemoryAllocator* device_allocator_ = nullptr;
   DebugOptions debug_options_;
   absl::optional<DeviceAssignment> static_device_assignment_;
 };
@@ -147,14 +149,14 @@ class Compiler {
   // allocated should be deallocated before this function returns.
   virtual StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Optimizes a HLO module group, a set of module which runs concurrently on
   // multiple devices potentially communicating data between the modules.
   virtual Status RunHloPassesOnModuleGroup(
       HloModuleGroup* module_group,
       absl::Span<se::StreamExecutor* const> executors,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles the HLO module for execution on a device given by the executor,
   // and returns an executable object or an error status. No HLO passes are
@@ -168,7 +170,7 @@ class Compiler {
   // device_allocator is optional; see RunHloPasses.
   virtual StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules.
@@ -176,7 +178,7 @@ class Compiler {
   RunBackendOnModuleGroup(
       std::unique_ptr<HloModuleGroup> module_group,
      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Compiles a set of HLO modules that can run in parallel, potentially
   // communicating data between the modules, and returns a corresponding
@@ -189,7 +191,7 @@ class Compiler {
   virtual StatusOr<std::vector<std::unique_ptr<Executable>>> Compile(
       std::unique_ptr<HloModuleGroup> module_group,
      std::vector<std::vector<se::StreamExecutor*>> stream_exec,
-      DeviceMemoryAllocator* device_allocator) = 0;
+      se::DeviceMemoryAllocator* device_allocator) = 0;
 
   // Returns the backend configurations that the backend will consider for the
   // given HLO. Returns no configurations if the backend does not support
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 529ed121731..ed57929f381 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -245,7 +245,6 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:computation_layout",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:executable",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_execution_profile",
@@ -255,6 +254,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core/profiler/lib:traceme",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "//tensorflow/stream_executor/host:host_stream",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index cbebfb08f06..06ea1e2f8bd 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -537,7 +537,7 @@ Status CreateHloProfilingArtifacts(
 
 StatusOr<std::unique_ptr<HloModule>> CpuCompiler::RunHloPasses(
     std::unique_ptr<HloModule> module, se::StreamExecutor* /*stream_exec*/,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   std::unique_ptr<llvm::TargetMachine> jit_target_machine =
       SimpleOrcJIT::InferTargetMachineForJIT(
           CompilerTargetOptions(module->config()),
@@ -597,7 +597,7 @@ struct OrcJITPostCompilationHook {
 
 StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
     std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-    DeviceMemoryAllocator* /*device_allocator*/) {
+    se::DeviceMemoryAllocator* /*device_allocator*/) {
   VLOG(1) << "Compiling: " << module->name();
   XLA_SCOPED_LOGGING_TIMER(
       absl::StrFormat("Compiling [%s] for CPU using JIT", module->name()));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
index 8ff0fd5a5c5..dd15891f175 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h
@@ -133,11 +133,11 @@ class CpuCompiler : public LLVMCompiler {
 
   StatusOr<std::unique_ptr<HloModule>> RunHloPasses(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::unique_ptr<Executable>> RunBackend(
       std::unique_ptr<HloModule> module, se::StreamExecutor* stream_exec,
-      DeviceMemoryAllocator* device_allocator) override;
+      se::DeviceMemoryAllocator* device_allocator) override;
 
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 23d0af34233..cc0f808569a 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -73,13 +73,13 @@ CpuExecutable::CpuExecutable(
 }
 
 StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
-                   std::vector<OwningDeviceMemory>>>
+                   std::vector<se::OwningDeviceMemory>>>
 CpuExecutable::CreateBufferTable(
-    DeviceMemoryAllocator* memory_allocator, int device_ordinal,
+    se::DeviceMemoryAllocator* memory_allocator, int device_ordinal,
     absl::Span<const ShapedBuffer* const> arguments) {
   std::vector<se::DeviceMemoryBase> unowning_buffers(
       assignment_->Allocations().size());
-  std::vector<OwningDeviceMemory> owning_buffers(
+  std::vector<se::OwningDeviceMemory> owning_buffers(
       assignment_->Allocations().size());
   VLOG(3) << "Allocating " << assignment_->Allocations().size()
           << " allocations for module " << module().name();
@@ -207,7 +207,7 @@ Status CpuExecutable::ExecuteComputeFunction(
 
 StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
     const ServiceExecutableRunOptions* run_options,
-    absl::Span<OwningDeviceMemory> buffers) {
+    absl::Span<se::OwningDeviceMemory> buffers) {
   se::Stream* stream = run_options->stream();
   ScopedShapedBuffer result_buffer(
       /*on_host_shape=*/result_shape(),
@@ -216,7 +216,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
   const HloInputOutputAliasConfig& input_output_alias =
       module().input_output_alias_config();
 
-  // Move OwningDeviceMemory values which contain the array(s) of the result
+  // Move se::OwningDeviceMemory values which contain the array(s) of the result
   // into the respective location in ScopedShapedBuffer which is returned to the
   // caller.
   TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus(
@@ -235,7 +235,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::CreateResultShapedBuffer(
            const BufferAllocation::Slice slice,
            this->assignment_->GetUniqueSlice(src, buffer_source->index()));
        const BufferAllocation::Index buffer_index = slice.index();
-        OwningDeviceMemory& buffer = buffers[buffer_index];
+        se::OwningDeviceMemory& buffer = buffers[buffer_index];
        if (!slice.allocation()->is_entry_computation_parameter()) {
          // If the buffer coming out of the result is from a parameter, the
          // owning buffer will be null, and that means the caller aliased some
@@ -297,8 +297,8 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
   auto* host_stream = dynamic_cast<se::host::HostStream*>(
       run_options->stream()->implementation());
   se::Stream* stream = run_options->stream();
-  DeviceMemoryAllocator* memory_allocator = run_options->allocator();
-  std::vector<OwningDeviceMemory> owning_buffers;
+  se::DeviceMemoryAllocator* memory_allocator = run_options->allocator();
+  std::vector<se::OwningDeviceMemory> owning_buffers;
   std::vector<se::DeviceMemoryBase> unowning_buffers;
   TF_ASSIGN_OR_RETURN(
       std::tie(unowning_buffers, owning_buffers),
@@ -326,7 +326,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
     CpuExecutable* executable;
     ServiceExecutableRunOptions run_options;
     std::vector<se::DeviceMemoryBase> unowning_buffers;
-    std::shared_ptr<std::vector<OwningDeviceMemory>> buffers;
+    std::shared_ptr<std::vector<se::OwningDeviceMemory>> buffers;
     HloExecutionProfile* hlo_execution_profile;
 
     void operator()() {
@@ -338,7 +338,7 @@ StatusOr<ScopedShapedBuffer> CpuExecutable::ExecuteAsyncOnStreamImpl(
   };
   host_stream->EnqueueTask(
       AsyncRunTask{this, *run_options, std::move(unowning_buffers),
-                   std::make_shared<std::vector<OwningDeviceMemory>>(
+                   std::make_shared<std::vector<se::OwningDeviceMemory>>(
                        std::move(owning_buffers)),
                   hlo_execution_profile});
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 3b91b15ba9b..735a20749b9 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -25,7 +25,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
@@ -37,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 
 namespace xla {
 namespace cpu {
@@ -111,8 +111,9 @@ class CpuExecutable : public Executable {
   // storage and the live-out buffer into which the computation writes it
   // result.
   StatusOr<std::pair<std::vector<se::DeviceMemoryBase>,
-                     std::vector<OwningDeviceMemory>>>
-  CreateBufferTable(DeviceMemoryAllocator* memory_allocator, int device_ordinal,
+                     std::vector<se::OwningDeviceMemory>>>
+  CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator,
+                    int device_ordinal,
                     absl::Span<const ShapedBuffer* const> arguments);
 
   // Calls the generated function performing the computation with the given
@@ -126,7 +127,7 @@ class CpuExecutable : public Executable {
   // The addresses are set according to buffer assignment.
   StatusOr<ScopedShapedBuffer> CreateResultShapedBuffer(
       const ServiceExecutableRunOptions* run_options,
-      absl::Span<OwningDeviceMemory> buffers);
+      absl::Span<se::OwningDeviceMemory> buffers);
 
   // Returns the points-to set of the root instruction of the entry
   // computation. Uses points-to analysis from buffer assignment.
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index a08ec181d49..e71629526ed 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -24,13 +24,11 @@ limitations under the License.
 #include "absl/types/variant.h"
 #include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/maybe_owning_device_memory.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_tree.h"
@@ -40,6 +38,8 @@ limitations under the License.
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
 
 namespace xla {
 
@@ -47,13 +47,13 @@ namespace xla {
 // leftover buffers to be released by the caller.
 struct ExecutionOutput {
   ExecutionOutput(ScopedShapedBuffer result,
-                  std::vector<OwningDeviceMemory> to_be_released)
+                  std::vector<se::OwningDeviceMemory> to_be_released)
       : result(std::move(result)),
        to_be_released(std::move(to_be_released)) {}
   ScopedShapedBuffer result;
 
   // Leftover buffers for the caller to release. Elements in this list are
   // donated input memory buffers that are not reused by XLA as outputs.
- std::vector to_be_released; + std::vector to_be_released; }; // A given platform's compiler will produce an Executable -- this is a uniform diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index d102eafaa7e..1cdeb4c88e2 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -282,10 +282,10 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", @@ -408,7 +408,6 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:buffer_assignment", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:executable", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_execution_profile", @@ -428,6 +427,7 @@ cc_library( "//tensorflow/stream_executor", "//tensorflow/stream_executor:blas", "//tensorflow/stream_executor:device_memory", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:kernel", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/base:core_headers", @@ -476,7 +476,6 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:compiler", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_casting_utils", "//tensorflow/compiler/xla/service:hlo_pass", @@ -485,6 +484,7 @@ cc_library( "//tensorflow/core:logger", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/util/proto:proto_utils", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -500,8 +500,8 @@ cc_library( deps = [ "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -517,12 +517,12 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:device_memory", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:stream_executor_headers", ], ) @@ -536,12 +536,12 @@ tf_cc_test( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", 
"//tensorflow/core/platform/default/build_config:stream_executor_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "//tensorflow/stream_executor:event", "//tensorflow/stream_executor:kernel", "//tensorflow/stream_executor/cuda:cuda_activation", @@ -634,12 +634,12 @@ cc_library( "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_pass", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:blas", + "//tensorflow/stream_executor:device_memory_allocator", "@com_google_absl//absl/types:optional", ], ) @@ -1164,7 +1164,6 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/stream_executor:stream_executor_headers", diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index f46a1bc51d9..3afc18d949a 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -39,7 +39,7 @@ void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index, StatusOr> BufferAllocations::Builder::Build( const BufferAssignment* buffer_assignment, int device_ordinal, - DeviceMemoryAllocator* memory_allocator) { + se::DeviceMemoryAllocator* memory_allocator) { const int64 num_buffers = buffer_assignment->Allocations().size(); auto buffer_allocations = absl::WrapUnique(new BufferAllocations( num_buffers, device_ordinal, memory_allocator, buffer_assignment)); @@ -77,7 +77,7 @@ StatusOr> BufferAllocations::Builder::Build( const int64 buffer_size = allocation.size(); se::DeviceMemoryBase buffer_address; if (buffer_size > 0) { - OwningDeviceMemory buffer; + se::OwningDeviceMemory buffer; TF_ASSIGN_OR_RETURN( buffer, memory_allocator->Allocate(device_ordinal, buffer_size)); if (reinterpret_cast(buffer.opaque()) % expected_alignment != diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h index 9413ac2cff7..cf78b92fe5b 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h @@ -23,9 +23,9 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { @@ -50,7 +50,7 @@ class BufferAllocations { // memory on. 
StatusOr> Build( const BufferAssignment* buffer_assignment, int device_ordinal, - DeviceMemoryAllocator* memory_allocator); + se::DeviceMemoryAllocator* memory_allocator); private: absl::flat_hash_map @@ -62,7 +62,9 @@ class BufferAllocations { BufferAllocations(const BufferAllocations&) = delete; BufferAllocations& operator=(const BufferAllocations&) = delete; - DeviceMemoryAllocator* memory_allocator() const { return memory_allocator_; } + se::DeviceMemoryAllocator* memory_allocator() const { + return memory_allocator_; + } int device_ordinal() const { return device_ordinal_; } // Returns the device address of buffer `buffer_index`. `buffer_index` must be @@ -84,7 +86,7 @@ class BufferAllocations { private: BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal, - DeviceMemoryAllocator* memory_allocator, + se::DeviceMemoryAllocator* memory_allocator, const BufferAssignment* buffer_assignment) : buffers_(buffer_count), device_ordinal_(device_ordinal), @@ -104,7 +106,7 @@ class BufferAllocations { se::DeviceMemoryBase temp_buffer_base_; int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; + se::DeviceMemoryAllocator* memory_allocator_; const BufferAssignment* buffer_assignment_; bool torn_down_ = false; }; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc index c110b338b65..b3f274e1130 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.cc @@ -256,9 +256,9 @@ StatusOr CudnnConvAlgorithmPicker::PickBestAlgorithmNoCache( const auto device_ordinal = stream_exec_->device_ordinal(); // allocator either points to this->allocator_ or, if that's null, to a - // StreamExecutorMemoryAllocator for stream_exec_. - DeviceMemoryAllocator* allocator; - optional se_allocator; + // se::StreamExecutorMemoryAllocator for stream_exec_. + se::DeviceMemoryAllocator* allocator; + optional se_allocator; if (allocator_ != nullptr) { allocator = allocator_; } else { diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h index 664fd7ff1c6..9e8a797739a 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h +++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_algorithm_picker.h @@ -19,13 +19,13 @@ limitations under the License. #include "absl/time/time.h" #include "absl/types/optional.h" #include "tensorflow/compiler/xla/service/compiler.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.h" #include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/protobuf/autotuning.pb.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { @@ -38,7 +38,8 @@ class CudnnConvAlgorithmPicker : public HloModulePass { // memory while timing the various convolution algorithms. If it's null, // we'll use the default allocator on the StreamExecutor. 
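[Reviewer note] The comment above refers to a fallback idiom that this patch updates in both cudnn_conv_algorithm_picker.cc and cusolver_rewriter.cc. Condensed, with the member names taken from those hunks (allocator_ may be null, stream_exec_ never is), the post-patch idiom is:

se::DeviceMemoryAllocator* allocator;
absl::optional<se::StreamExecutorMemoryAllocator> se_allocator;
if (allocator_ != nullptr) {
  allocator = allocator_;  // caller-supplied allocator wins
} else {
  // Wrap just this executor in a temporary allocator.
  se_allocator.emplace(stream_exec_->platform(),
                       absl::Span<se::StreamExecutor* const>({stream_exec_}));
  allocator = &*se_allocator;
}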
CudnnConvAlgorithmPicker(se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* allocator, Compiler* compiler) + se::DeviceMemoryAllocator* allocator, + Compiler* compiler) : stream_exec_(stream_exec), allocator_(allocator), compiler_(compiler) {} absl::string_view name() const override { @@ -56,7 +57,7 @@ class CudnnConvAlgorithmPicker : public HloModulePass { const HloCustomCallInstruction* instr); se::StreamExecutor* stream_exec_; // never null - DeviceMemoryAllocator* allocator_; // may be null + se::DeviceMemoryAllocator* allocator_; // may be null Compiler* compiler_; }; diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc index 7861eb1ef04..2ba6e8fc3c5 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc +++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.cc @@ -174,9 +174,9 @@ StatusOr CusolverRewriter::RunOnComputation(HloComputation* computation) { const auto device_ordinal = stream_exec_->device_ordinal(); // allocator either points to this->allocator_ or, if that's null, to a - // StreamExecutorMemoryAllocator for stream_exec_. - DeviceMemoryAllocator* allocator; - absl::optional se_allocator; + // se::StreamExecutorMemoryAllocator for stream_exec_. + se::DeviceMemoryAllocator* allocator; + absl::optional se_allocator; if (allocator_ != nullptr) { allocator = allocator_; } else { @@ -200,7 +200,7 @@ StatusOr CusolverRewriter::RunOnComputation(HloComputation* computation) { } CusolverRewriter::CusolverRewriter(se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* allocator) + se::DeviceMemoryAllocator* allocator) : stream_exec_(stream_exec), allocator_(allocator) {} StatusOr CusolverRewriter::Run(HloModule* module) { diff --git a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h index c82233188f7..d8c2cc55872 100644 --- a/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h +++ b/tensorflow/compiler/xla/service/gpu/cusolver_rewriter.h @@ -16,12 +16,12 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_CUSOLVER_REWRITER_H_ -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/gpu/cusolver_context.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { @@ -30,7 +30,7 @@ namespace gpu { class CusolverRewriter : public HloModulePass { public: CusolverRewriter(se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* allocator); + se::DeviceMemoryAllocator* allocator); absl::string_view name() const override { return "cusolver-rewriter"; } StatusOr Run(HloModule* module) override; @@ -39,7 +39,7 @@ class CusolverRewriter : public HloModulePass { StatusOr RunOnComputation(HloComputation* computation); se::StreamExecutor* stream_exec_; // never null - DeviceMemoryAllocator* allocator_; // may be null + se::DeviceMemoryAllocator* allocator_; // may be null }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index ca4a605af5d..8f40010bdcb 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -29,7 +29,7 @@ namespace xla { namespace gpu { FftScratchAllocator::FftScratchAllocator( - int device_ordinal, DeviceMemoryAllocator* memory_allocator) + int device_ordinal, se::DeviceMemoryAllocator* memory_allocator) : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) { @@ -48,7 +48,7 @@ StatusOr> FftScratchAllocator::AllocateBytes( byte_size, GetMemoryLimitInBytes(stream))); } - TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer, + TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer, memory_allocator_->Allocate(device_ordinal_, byte_size, /*retry_on_failure=*/false)); total_allocated_bytes_ += byte_size; diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h index 2be50e08bd2..f653e4f12fe 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h @@ -38,7 +38,7 @@ namespace gpu { class FftScratchAllocator : public se::ScratchAllocator { public: FftScratchAllocator(int device_ordinal, - DeviceMemoryAllocator* memory_allocator); + se::DeviceMemoryAllocator* memory_allocator); int64 GetMemoryLimitInBytes(se::Stream* stream) override; @@ -49,8 +49,8 @@ class FftScratchAllocator : public se::ScratchAllocator { private: const int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; + se::DeviceMemoryAllocator* memory_allocator_; + std::vector allocated_buffers_; int64 total_allocated_bytes_ = 0; }; diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index f65ff9b2cd8..dec40c5e49c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -230,7 +230,7 @@ StatusOr GpuExecutable::Execute( const ServiceExecutableRunOptions* run_options, absl::Span arguments, HloExecutionProfile* hlo_execution_profile, bool block_host_until_done) { 
- DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); if (GetRootPointsToSet().IsAmbiguous()) { return Unimplemented("Points-to set of root instruction is ambiguous"); @@ -348,7 +348,7 @@ StatusOr GpuExecutable::ExecuteOnStream( StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, absl::Span arguments) { - DeviceMemoryAllocator* memory_allocator = run_options->allocator(); + se::DeviceMemoryAllocator* memory_allocator = run_options->allocator(); // Force synchronous execution if the allocator requires it. bool block_host_until_done = !memory_allocator->AllowsAsynchronousDeallocation(); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h index 8e71647a0da..b1f63bc672e 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h @@ -24,7 +24,6 @@ limitations under the License. #include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h" #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h" @@ -38,6 +37,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace gpu { diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index d977e0a1629..ace03589803 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -164,7 +164,7 @@ string GetLibdeviceDir(const HloModuleConfig& hlo_module_config) { // It takes a compiler pointer, as passes may compile and execute HLOs on the // fly for cuDNN verification or other purposes. Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator, + se::DeviceMemoryAllocator* device_allocator, Compiler* compiler) { { HloPassPipeline pipeline("optimization"); @@ -463,7 +463,7 @@ NVPTXCompiler::NVPTXCompiler() StatusOr> NVPTXCompiler::RunHloPasses( std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { // We dump the post-optimization HLO in RunBackend so no need to dump it here. 
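[Reviewer note] With this change every Compiler entry point takes an se::DeviceMemoryAllocator*. A hypothetical driver (compiler, platform, executors, stream_exec, and module are assumptions for illustration, not part of this patch) now reads:

se::StreamExecutorMemoryAllocator allocator(platform, executors);
TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> optimized_module,
                    compiler->RunHloPasses(std::move(module), stream_exec,
                                           &allocator));
TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
                    compiler->RunBackend(std::move(optimized_module),
                                         stream_exec, &allocator));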
XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses"); tensorflow::profiler::TraceMe activity( @@ -479,7 +479,7 @@ StatusOr> NVPTXCompiler::RunHloPasses( StatusOr> NVPTXCompiler::RunBackend( std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunBackend"); TF_RET_CHECK(stream_exec != nullptr); diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h index b74e5f01c6e..25e4b9427c0 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h @@ -53,11 +53,11 @@ class NVPTXCompiler : public LLVMCompiler { StatusOr> RunHloPasses( std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::unique_ptr module_group, diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc index c5f812b68cd..b2229971e9f 100644 --- a/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc +++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator.cc @@ -50,7 +50,7 @@ StatusOr> RedzoneAllocator::AllocateBytes( int64 rhs_slop = RoundUpToNearest(byte_size, kRhsRedzoneAlign) - byte_size; TF_ASSIGN_OR_RETURN( - OwningDeviceMemory allocated_buffer, + se::OwningDeviceMemory allocated_buffer, memory_allocator_->Allocate(device_ordinal_, byte_size + 2 * redzone_size_ + rhs_slop, /*retry_on_failure=*/false)); diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator.h b/tensorflow/compiler/xla/service/gpu/redzone_allocator.h index f92167bcc42..4e3438c6dfc 100644 --- a/tensorflow/compiler/xla/service/gpu/redzone_allocator.h +++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator.h @@ -18,12 +18,12 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/gpu/gpu_constants.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { namespace gpu { @@ -41,7 +41,8 @@ namespace gpu { // memory for cudnn convolutions. class RedzoneAllocator : public se::ScratchAllocator { public: - RedzoneAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator, + RedzoneAllocator(int device_ordinal, + se::DeviceMemoryAllocator* memory_allocator, const HloModuleConfig& hlo_module_config, int64 redzone_size = 1 << 23, // 8MiB per side, 16MiB total uint8 redzone_pattern = -1) @@ -76,14 +77,14 @@ class RedzoneAllocator : public se::ScratchAllocator { const int64 redzone_size_; const uint8 redzone_pattern_; - DeviceMemoryAllocator* memory_allocator_; + se::DeviceMemoryAllocator* memory_allocator_; const HloModuleConfig& hlo_module_config_; // The second element of the pair is the size of the user allocation. 
This // isn't necessarily just first.size() - 2 * redzone_size_ because when the // user allocation size is not a multiple of 4 bytes, we round up the size of // the RHS redzone. - std::vector> allocated_buffers_; + std::vector> allocated_buffers_; int64 allocated_bytes_excluding_redzones_ = 0; }; diff --git a/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc b/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc index a36aaa3f216..a3b0ac3ecae 100644 --- a/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc +++ b/tensorflow/compiler/xla/service/gpu/redzone_allocator_test.cc @@ -15,13 +15,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/redzone_allocator.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" #include "tensorflow/stream_executor/multi_platform_manager.h" #include "tensorflow/stream_executor/platform.h" @@ -42,7 +42,7 @@ TEST(RedzoneAllocatorTest, WriteToRedzone) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie(); HloModuleConfig config; - StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); + se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config, kRedzoneSize, kRedzonePattern); @@ -118,7 +118,7 @@ TEST(RedzoneAllocatorTest, VeryLargeRedzone) { se::MultiPlatformManager::PlatformWithName("cuda").ValueOrDie(); se::StreamExecutor* stream_exec = platform->ExecutorForDevice(0).ValueOrDie(); HloModuleConfig config; - StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); + se::StreamExecutorMemoryAllocator se_allocator(platform, {stream_exec}); RedzoneAllocator allocator(/*device_ordinal=*/0, &se_allocator, config, kRedzoneSize, /*redzone_pattern=*/-1); se::Stream stream(stream_exec); diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc index 197367e8168..7a3220483a8 100644 --- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc +++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.cc @@ -29,7 +29,7 @@ StatusOr> ScratchAllocator::AllocateBytes( byte_size, GetMemoryLimitInBytes(stream))); } - TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer, + TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory allocated_buffer, memory_allocator_->Allocate(device_ordinal_, byte_size, /*retry_on_failure=*/false)); total_allocated_bytes_ += byte_size; diff --git a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h index 620c7e78912..a22e7f5ea24 100644 --- a/tensorflow/compiler/xla/service/gpu/scratch_allocator.h +++ b/tensorflow/compiler/xla/service/gpu/scratch_allocator.h @@ -18,18 +18,19 @@ limitations under the License. 
#include -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { namespace gpu { class ScratchAllocator : public se::ScratchAllocator { public: - ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator) + ScratchAllocator(int device_ordinal, + se::DeviceMemoryAllocator* memory_allocator) : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} int64 GetMemoryLimitInBytes(se::Stream* stream) override { @@ -50,8 +51,8 @@ class ScratchAllocator : public se::ScratchAllocator { private: const int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; + se::DeviceMemoryAllocator* memory_allocator_; + std::vector allocated_buffers_; int64 total_allocated_bytes_ = 0; }; diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index a8f8ab4f725..b959273bbcf 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -96,7 +96,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { StatusOr> InterpreterCompiler::RunHloPasses( std::unique_ptr hlo_module, se::StreamExecutor* /*stream_exec*/, - DeviceMemoryAllocator* /*device_allocator*/) { + se::DeviceMemoryAllocator* /*device_allocator*/) { VLOG(1) << "Run hlo passes on graph " << hlo_module->name(); TF_RETURN_IF_ERROR(RunHloOptimization(hlo_module.get())); return std::move(hlo_module); @@ -105,13 +105,13 @@ StatusOr> InterpreterCompiler::RunHloPasses( Status InterpreterCompiler::RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span executors, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented("Module group compilation not supported on Interpreter"); } StatusOr> InterpreterCompiler::RunBackend( std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* /*device_allocator*/) { + se::DeviceMemoryAllocator* /*device_allocator*/) { TF_RET_CHECK(stream_exec != nullptr); VLOG(1) << "Run backend " << hlo_module->name(); @@ -137,7 +137,7 @@ StatusOr>> InterpreterCompiler::RunBackendOnModuleGroup( std::unique_ptr module_group, std::vector> stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented( "Module group compilation is not supported on Interpreter."); } @@ -145,7 +145,7 @@ InterpreterCompiler::RunBackendOnModuleGroup( StatusOr>> InterpreterCompiler::Compile( std::unique_ptr module_group, std::vector> stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { if (module_group->empty()) { return std::vector>(); } diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.h b/tensorflow/compiler/xla/service/interpreter/compiler.h index 591272951a0..dc83295b527 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.h +++ b/tensorflow/compiler/xla/service/interpreter/compiler.h @@ -45,24 +45,24 @@ class InterpreterCompiler : public Compiler { StatusOr> RunHloPasses( std::unique_ptr hlo_module, 
se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; Status RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span executors, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr> RunBackend( std::unique_ptr hlo_module, se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr>> RunBackendOnModuleGroup( std::unique_ptr module_group, std::vector> stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr>> Compile( std::unique_ptr module_group, std::vector> stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr>> CompileAheadOfTime(std::unique_ptr module_group, diff --git a/tensorflow/compiler/xla/service/llvm_compiler.cc b/tensorflow/compiler/xla/service/llvm_compiler.cc index 382b5751202..82e955c818e 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.cc +++ b/tensorflow/compiler/xla/service/llvm_compiler.cc @@ -24,7 +24,7 @@ namespace xla { Status LLVMCompiler::RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span executors, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented( "Model partitioning not implemented for the CPU/GPU compilers!"); } @@ -33,7 +33,7 @@ StatusOr>> LLVMCompiler::RunBackendOnModuleGroup( std::unique_ptr module_group, std::vector> stream_exec, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { return Unimplemented( "Model partitioning not implemented for the CPU/GPU compilers!"); } @@ -41,7 +41,7 @@ LLVMCompiler::RunBackendOnModuleGroup( StatusOr>> LLVMCompiler::Compile( std::unique_ptr module_group, std::vector> stream_execs, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { // Tensorflow tries to enable the following behaviors in all its threads: // // - Denormals are zero (DAZ): roughly, operations treat denormal floats as diff --git a/tensorflow/compiler/xla/service/llvm_compiler.h b/tensorflow/compiler/xla/service/llvm_compiler.h index afd9f370383..888815bea3d 100644 --- a/tensorflow/compiler/xla/service/llvm_compiler.h +++ b/tensorflow/compiler/xla/service/llvm_compiler.h @@ -61,28 +61,28 @@ class LLVMCompiler : public Compiler { // StatusOr> RunBackend( // std::unique_ptr module, // se::StreamExecutor* stream_exec, - // DeviceMemoryAllocator* device_allocator) + // se::DeviceMemoryAllocator* device_allocator) // StatusOr> RunHloPasses( // std::unique_ptr module, // se::StreamExecutor* stream_exec, - // DeviceMemoryAllocator* device_allocator) + // se::DeviceMemoryAllocator* device_allocator) using Compiler::RunBackend; using Compiler::RunHloPasses; Status RunHloPassesOnModuleGroup( HloModuleGroup* module_group, absl::Span executors, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr>> RunBackendOnModuleGroup( std::unique_ptr module_group, std::vector> stream_exec, - DeviceMemoryAllocator* device_allocator) override; + se::DeviceMemoryAllocator* device_allocator) override; StatusOr>> Compile( std::unique_ptr module_group, std::vector> stream_execs, - DeviceMemoryAllocator* device_allocator) override; + 
se::DeviceMemoryAllocator* device_allocator) override; protected: ModuleHook user_pre_optimization_hook_; diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index f56ba32b04b..170d226e336 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -23,13 +23,13 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/compiler.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/service.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc index 8269842426e..1642c50d225 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.cc @@ -17,25 +17,29 @@ limitations under the License. #include "absl/types/variant.h" namespace xla { -se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() { +tensorflow::se::DeviceMemoryBase MaybeOwningDeviceMemory::AsDeviceMemoryBase() { if (HasOwnership()) { - return absl::get(mem_).AsDeviceMemoryBase(); + return absl::get(mem_) + .AsDeviceMemoryBase(); } else { - return absl::get(mem_); + return absl::get(mem_); } } bool MaybeOwningDeviceMemory::HasOwnership() const { - return absl::holds_alternative(mem_); + return absl::holds_alternative(mem_); } -absl::optional MaybeOwningDeviceMemory::Release() { +absl::optional +MaybeOwningDeviceMemory::Release() { if (!HasOwnership()) { return {}; } - OwningDeviceMemory result = std::move(absl::get(mem_)); + tensorflow::se::OwningDeviceMemory result = + std::move(absl::get(mem_)); mem_ = result.AsDeviceMemoryBase(); - return absl::make_optional(std::move(result)); + return absl::make_optional( + std::move(result)); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h index 82e7f1183c0..e4c3196640e 100644 --- a/tensorflow/compiler/xla/service/maybe_owning_device_memory.h +++ b/tensorflow/compiler/xla/service/maybe_owning_device_memory.h @@ -18,30 +18,30 @@ limitations under the License. #include "absl/types/optional.h" #include "absl/types/variant.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" -#include "tensorflow/compiler/xla/service/owning_device_memory.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" +#include "tensorflow/stream_executor/owning_device_memory.h" namespace xla { // MaybeOwningDeviceMemory represents either an owned or unowned device memory. -// Like std::variant. When the object goes +// Like std::variant. When the object goes // output of scope, it will free the underlying memory if it owns it. 
class MaybeOwningDeviceMemory { public: MaybeOwningDeviceMemory() = default; - explicit MaybeOwningDeviceMemory(OwningDeviceMemory owned) + explicit MaybeOwningDeviceMemory(tensorflow::se::OwningDeviceMemory owned) : mem_(std::move(owned)) {} - explicit MaybeOwningDeviceMemory(se::DeviceMemoryBase unowned) + explicit MaybeOwningDeviceMemory(tensorflow::se::DeviceMemoryBase unowned) : mem_(unowned) {} MaybeOwningDeviceMemory(MaybeOwningDeviceMemory&&) = default; ~MaybeOwningDeviceMemory() = default; - MaybeOwningDeviceMemory& operator=(se::DeviceMemoryBase unowned) { + MaybeOwningDeviceMemory& operator=(tensorflow::se::DeviceMemoryBase unowned) { mem_ = unowned; return *this; } - MaybeOwningDeviceMemory& operator=(OwningDeviceMemory owned) { + MaybeOwningDeviceMemory& operator=(tensorflow::se::OwningDeviceMemory owned) { mem_ = std::move(owned); return *this; } @@ -50,19 +50,21 @@ class MaybeOwningDeviceMemory { // Fetches the underlying DeviceMemoryBase from a MaybeOwningDeviceMemory. The // caller of this function is *not* responsible for freeing the memory. - se::DeviceMemoryBase AsDeviceMemoryBase(); + tensorflow::se::DeviceMemoryBase AsDeviceMemoryBase(); - // Release the OwningDeviceMemory without freeing it, and moves the ownership - // of the memory buffer from the object to the caller. + // Release the tensorflow::se::OwningDeviceMemory without freeing it, and + // moves the ownership of the memory buffer from the object to the caller. // // A nullopt is returned if the HasOwnership() == false; - absl::optional Release(); + absl::optional Release(); // Returns true if the device_memory has ownership over underlying memory. bool HasOwnership() const; private: - absl::variant mem_; + absl::variant + mem_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 49c346d87fc..42b9e566d71 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/computation_placer.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h" #include "tensorflow/compiler/xla/service/executable.h" @@ -58,6 +57,7 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/ptr_util.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -347,7 +347,7 @@ StatusOr>> Service::BuildExecutables( const std::vector& module_protos, std::vector> module_configs, Backend* backend, std::vector> executors, - DeviceMemoryAllocator* device_allocator) { + se::DeviceMemoryAllocator* device_allocator) { VLOG(1) << StrFormat("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. 
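[Reviewer note] For the MaybeOwningDeviceMemory changes above: the class keeps its xla:: spelling while its variant alternatives move to stream_executor. A short usage sketch against the new API (assumes the usual `namespace se = ::stream_executor;` alias and some se::DeviceMemoryAllocator* named allocator):

TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory owned,
                    allocator->Allocate(/*device_ordinal=*/0, /*size=*/64));

MaybeOwningDeviceMemory owning(std::move(owned));  // owns; frees on destruction
CHECK(owning.HasOwnership());

se::DeviceMemoryBase raw = owning.AsDeviceMemoryBase();  // ownership unchanged
MaybeOwningDeviceMemory unowned(raw);                    // never frees
CHECK(!unowned.HasOwnership());

// Release() moves the buffer back to the caller; nullopt when unowned.
absl::optional<se::OwningDeviceMemory> released = owning.Release();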
@@ -783,7 +783,7 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, StatusOr> Service::BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { + se::StreamExecutor* executor, se::DeviceMemoryAllocator* device_allocator) { VLOG(1) << StrFormat( "BuildExecutable on service %p with serialized module proto: %s", this, module_proto.name()); diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index f127e340b59..ba51e457c20 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -29,7 +29,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/channel_tracker.h" #include "tensorflow/compiler/xla/service/compilation_cache.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/execution_tracker.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" @@ -43,6 +42,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -234,7 +234,7 @@ class Service : public ServiceInterface { const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, se::StreamExecutor* executor, - DeviceMemoryAllocator* device_allocator = nullptr); + se::DeviceMemoryAllocator* device_allocator = nullptr); // Same as BuildExecutable() above, but builds a list of Executables for the // given computations that may interact with each other. @@ -242,7 +242,7 @@ class Service : public ServiceInterface { const std::vector& module_protos, std::vector> module_configs, Backend* backend, std::vector> executors, - DeviceMemoryAllocator* device_allocator); + se::DeviceMemoryAllocator* device_allocator); // Runs the given executable with the given arguments and register the result // in the allocation tracker. The handle of the result from the tracker is diff --git a/tensorflow/compiler/xla/service/service_executable_run_options.h b/tensorflow/compiler/xla/service/service_executable_run_options.h index 6bee6710565..7fc66310ee7 100644 --- a/tensorflow/compiler/xla/service/service_executable_run_options.h +++ b/tensorflow/compiler/xla/service/service_executable_run_options.h @@ -43,7 +43,9 @@ class ServiceExecutableRunOptions { // Delegate to `ExecutableRunOptions` member. 
se::Stream* stream() const { return run_options_.stream(); } - DeviceMemoryAllocator* allocator() const { return run_options_.allocator(); } + se::DeviceMemoryAllocator* allocator() const { + return run_options_.allocator(); + } int device_ordinal() const { return run_options_.device_ordinal(); } // Borrows a stream and returns a smart pointer which returns the stream on diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index 69d34583d9e..9b0ec31e9da 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -119,14 +119,14 @@ std::ostream& operator<<(std::ostream& out, const ShapedBuffer& buffer) { ScopedShapedBuffer::ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, - DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, int device_ordinal) : ShapedBuffer(on_host_shape, on_device_shape, allocator->platform(), device_ordinal), allocator_(allocator) {} ScopedShapedBuffer::ScopedShapedBuffer(ShapedBuffer shaped_buffer, - DeviceMemoryAllocator* allocator) + se::DeviceMemoryAllocator* allocator) : ShapedBuffer(std::move(shaped_buffer)), allocator_(allocator) {} ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s) diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index 619b6ccd1cc..39346540d8d 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -21,12 +21,12 @@ limitations under the License. #include #include "absl/types/span.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { @@ -138,13 +138,13 @@ class ScopedShapedBuffer : public ShapedBuffer { // Creates a ScopedShapedBuffer with null DeviceMemoryBases at each index. explicit ScopedShapedBuffer(const Shape& on_host_shape, const Shape& on_device_shape, - DeviceMemoryAllocator* allocator, + se::DeviceMemoryAllocator* allocator, int device_ordinal); // Create a ScopedShapedBuffer by taking over the memory from the incoming // ShapedBuffer. explicit ScopedShapedBuffer(ShapedBuffer shaped_buffer, - DeviceMemoryAllocator* allocator); + se::DeviceMemoryAllocator* allocator); // Movable, but not copyable. ScopedShapedBuffer(ScopedShapedBuffer&& s); @@ -157,13 +157,13 @@ class ScopedShapedBuffer : public ShapedBuffer { // Return the allocator used to allocate the device memory held in this // ScopedShapedBuffer. - DeviceMemoryAllocator* memory_allocator() const { return allocator_; } + se::DeviceMemoryAllocator* memory_allocator() const { return allocator_; } // Sets the device memory buffer at the given index. // // If the given buffer's device memory is non-null, its device_ordinal and // allocator must match those in `this`. 
- void set_buffer(OwningDeviceMemory buffer, const ShapeIndex& index) { + void set_buffer(se::OwningDeviceMemory buffer, const ShapeIndex& index) { if (!buffer.is_null()) { CHECK_EQ(buffer.device_ordinal(), device_ordinal()); CHECK_EQ(buffer.allocator(), allocator_); @@ -187,7 +187,7 @@ class ScopedShapedBuffer : public ShapedBuffer { protected: void Deallocate(); - DeviceMemoryAllocator* allocator_; + se::DeviceMemoryAllocator* allocator_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc index 3f0042e4bcb..3885c5f3759 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer_test.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -16,13 +16,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "absl/memory/memory.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/util/ptr_util.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -34,7 +34,7 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) { auto* platform = platforms[0]; TF_ASSERT_OK_AND_ASSIGN(auto executors, xla::PlatformUtil::GetStreamExecutors(platform)); - xla::StreamExecutorMemoryAllocator allocator(platform, executors); + xla::se::StreamExecutorMemoryAllocator allocator(platform, executors); const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {}); const int kDeviceOrdinal = 0; auto scoped_buffer = absl::make_unique( @@ -43,11 +43,11 @@ TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) { buffer = nullptr; } -class TestAllocator : public DeviceMemoryAllocator { +class TestAllocator : public se::DeviceMemoryAllocator { public: TestAllocator() - : DeviceMemoryAllocator(PlatformUtil::GetDefaultPlatform().ValueOrDie()) { - } + : se::DeviceMemoryAllocator( + PlatformUtil::GetDefaultPlatform().ValueOrDie()) {} ~TestAllocator() override { if (!allocations_.empty()) { @@ -56,18 +56,18 @@ class TestAllocator : public DeviceMemoryAllocator { } // Pull in two-arg overload of Allocate. - using DeviceMemoryAllocator::Allocate; + using se::DeviceMemoryAllocator::Allocate; - StatusOr Allocate(int device_ordinal, uint64 size, - bool /*retry_on_failure*/) override { + StatusOr Allocate( + int device_ordinal, uint64 size, bool /*retry_on_failure*/) override { // By contract, we must return null if size == 0. 
if (size == 0) { - return OwningDeviceMemory(); + return se::OwningDeviceMemory(); } void* buf = malloc(size); allocations_.insert({device_ordinal, buf}); - return OwningDeviceMemory(se::DeviceMemoryBase(buf, size), device_ordinal, - this); + return se::OwningDeviceMemory(se::DeviceMemoryBase(buf, size), + device_ordinal, this); } Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override { @@ -120,7 +120,7 @@ TEST(ScopedShapedBufferTest, TestTakeSubTree) { sb.buffers().ForEachMutableElement( [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { TF_ASSERT_OK_AND_ASSIGN( - OwningDeviceMemory m, + se::OwningDeviceMemory m, allocator.Allocate(/*device_ordinal=*/0, /*size=*/77)); *buffer = m.Forget(); }); @@ -158,7 +158,7 @@ TEST(ScopedShapedBufferTest, TestSubShapeTree) { sb.buffers().ForEachMutableElement( [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { TF_ASSERT_OK_AND_ASSIGN( - OwningDeviceMemory m, + se::OwningDeviceMemory m, allocator.Allocate(/*device_ordinal=*/0, /*size=*/32)); *buffer = m.Forget(); }); diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index b93ce99ca18..6474edf2701 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -308,7 +308,7 @@ Status TransferManager::TransferBufferToDevice( } StatusOr TransferManager::AllocateScopedShapedBuffer( - const Shape& on_host_shape, DeviceMemoryAllocator* allocator, + const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator, int device_ordinal) { if (!LayoutUtil::HasLayout(on_host_shape)) { return InvalidArgument("Shape must have a layout: %s", diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index 17a0a3c17f4..f08862bff26 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -229,7 +229,7 @@ class TransferManager { // shape. The on-device shape may be different as indicated by // HostShapeToDeviceShape. 
StatusOr AllocateScopedShapedBuffer( - const Shape& on_host_shape, DeviceMemoryAllocator* allocator, + const Shape& on_host_shape, se::DeviceMemoryAllocator* allocator, int device_ordinal); // The given ShapedBuffer holds a handle to allocated memory, but it is not diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index c60ae52fef2..82447adace0 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -259,7 +259,6 @@ cc_library( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/service:computation_placer", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", @@ -268,6 +267,7 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/stream_executor:device_memory_allocator", "//third_party/eigen3", "@com_google_absl//absl/memory", "@com_google_absl//absl/types:span", @@ -1172,7 +1172,6 @@ xla_test( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/service:computation_placer", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", @@ -1183,6 +1182,7 @@ xla_test( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -2078,7 +2078,6 @@ xla_test( "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:platform_util", "//tensorflow/compiler/xla/service:shaped_buffer", @@ -2090,6 +2089,7 @@ xla_test( "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", ], ) @@ -2206,13 +2206,13 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:generic_transfer_manager", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:stream_pool", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", + "//tensorflow/stream_executor:device_memory_allocator", ], ) diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index 587db49957b..59c3d4f5c7e 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -1521,7 +1521,7 @@ void DOT_ReorderContracting(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); xla::LocalClientOptions client_options; 
client_options.set_platform(platform); diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index 82e2db36143..1ea72af5f5f 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -21,7 +21,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -34,6 +33,7 @@ limitations under the License. #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -736,7 +736,7 @@ void BM_DynamicSlice(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie(); auto* transfer_manager = diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index f4a7309adc9..2d0805cdb0e 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -829,7 +829,7 @@ void BM_ParallelFusion(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); const int64 intra_op_parallelism_threads = 24; xla::LocalClientOptions client_options; diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index 2d4d480cd48..67a1abacd18 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -22,7 +22,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -41,6 +40,7 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { @@ -902,7 +902,7 @@ void BM_LocalClientOverhead(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie(); auto* transfer_manager = diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index 710d8ae40aa..7eaa2791d47 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -35,17 +35,16 @@ namespace xla { /* static */ TestAllocator* LocalClientTestBase::allocator_; -StatusOr TestAllocator::Allocate(int device_ordinal, - uint64 size, - bool retry_on_failure) { +StatusOr TestAllocator::Allocate( + int device_ordinal, uint64 size, bool retry_on_failure) { VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")"; { tensorflow::mutex_lock lock(count_mutex_); allocation_count_++; device_allocation_count_[device_ordinal]++; } - return StreamExecutorMemoryAllocator::Allocate(device_ordinal, size, - retry_on_failure); + return se::StreamExecutorMemoryAllocator::Allocate(device_ordinal, size, + retry_on_failure); } Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { @@ -55,7 +54,7 @@ Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { deallocation_count_++; device_deallocation_count_[device_ordinal]++; } - return StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem); + return se::StreamExecutorMemoryAllocator::Deallocate(device_ordinal, mem); } int64 TestAllocator::allocation_count() const { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index 4027c7b124f..292baacf969 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" @@ -36,18 +35,19 @@ limitations under the License. 
#include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { -class TestAllocator : public StreamExecutorMemoryAllocator { +class TestAllocator : public se::StreamExecutorMemoryAllocator { public: explicit TestAllocator(se::Platform* platform) - : StreamExecutorMemoryAllocator( + : se::StreamExecutorMemoryAllocator( platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) { } - StatusOr Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) override; + StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; // Return the number of allocations that have been performed. diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc index c27ab5af76e..00b72cedbf5 100644 --- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc @@ -19,7 +19,6 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal.h" -#include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/generic_transfer_manager.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/stream_pool.h" @@ -34,6 +33,7 @@ limitations under the License. #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/test_benchmark.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/stream_executor/device_memory_allocator.h" namespace xla { namespace { diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 85212fa56d7..4d80a57ad40 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -1265,7 +1265,7 @@ void BM_WhileLoop(int num_iters) { se::Platform* platform = PlatformUtil::GetDefaultPlatform().ValueOrDie(); auto executors = PlatformUtil::GetStreamExecutors(platform).ValueOrDie(); - StreamExecutorMemoryAllocator allocator(platform, executors); + se::StreamExecutorMemoryAllocator allocator(platform, executors); LocalClient* client = ClientLibrary::GetOrCreateLocalClient(platform).ValueOrDie(); diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 7b7b8f5d02d..b36fc4174ae 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -135,7 +135,7 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, LocalService* service = ClientLibrary::GetXlaService(client->platform()); Backend* backend = service->mutable_backend(); se::StreamExecutor* executor = backend->default_stream_executor(); - DeviceMemoryAllocator* allocator = backend->memory_allocator(); + se::DeviceMemoryAllocator* allocator = backend->memory_allocator(); auto* transfer_manager = backend->transfer_manager(); TF_ASSERT_OK_AND_ASSIGN( StreamPool::Ptr stream_ptr, diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index d66561315b4..3d443beeecb 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ 
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index d66561315b4..3d443beeecb 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -271,7 +271,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   // Run the computation num_runs times, and return the result from the last
   // execution.
   const bool xla_hlo_profile = GetDebugOptionsFromFlags().xla_hlo_profile();
-  StreamExecutorMemoryAllocator allocator(
+  se::StreamExecutorMemoryAllocator allocator(
       client->platform(),
       {client->platform()->ExecutorForDevice(0).ValueOrDie()});
   absl::optional<Literal> final_result;
diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD
index 4320a4c5eae..acd984f9e99 100644
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@@ -67,13 +67,13 @@ cc_library(
         "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:backend",
-        "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/stream_executor",
+        "//tensorflow/stream_executor:device_memory_allocator",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/synchronization",
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 2ae6f964623..07abd60f6b2 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -117,7 +117,7 @@ Status AllocateScopedShapedBuffer(
         xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first);
     uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
     TF_ASSIGN_OR_RETURN(
-        xla::OwningDeviceMemory buffer,
+        se::OwningDeviceMemory buffer,
        allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false));
     // Move our buffer into shaped_buffer, which takes ownership of it.
     index_to_buffer.second = buffer.Forget();
@@ -135,7 +135,7 @@ Status AllocateScopedShapedBuffer(

 XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          int device_ordinal,
-                                         xla::DeviceMemoryAllocator* allocator)
+                                         se::DeviceMemoryAllocator* allocator)
     : size_(allocation.size()),
       allocation_(allocation),
       device_ordinal_(device_ordinal),
@@ -169,7 +169,7 @@ void XRTBufferAllocation::DiscardAllocation() {
 }

 XRTTupleAllocation::XRTTupleAllocation(int device_ordinal,
-                                       xla::DeviceMemoryAllocator* allocator,
+                                       se::DeviceMemoryAllocator* allocator,
                                        const xla::Shape& on_host_shape,
                                        const xla::Shape& on_device_shape)
     : device_ordinal_(device_ordinal),
@@ -342,7 +342,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr;

 /* static */ Status XRTTupleAllocation::ExpandTreeOfTuples(
     const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
-    xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
+    se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
     xla::Shape* device_shape) {
   // Initialize both host and device shape to be the 'spine' of the new tuple
   // shape, given by the shape of the tree of tuples.
@@ -415,7 +415,7 @@ typedef XRTBufferAllocation* XRTBufferAllocationPtr;
       xla::Shape subshape =
           xla::ShapeUtil::GetSubshape(device_shape, index);
       uint64 size = transfer_manager->GetByteSizeRequirement(subshape);
-      TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer,
+      TF_ASSIGN_OR_RETURN(se::OwningDeviceMemory buffer,
                           allocator->Allocate(device_ordinal, size,
                                               /*retry_on_failure=*/false));
       VLOG(2) << "Allocated buffer at " << buffer.opaque() << " index "
@@ -502,7 +502,7 @@ bool XRTTupleAllocation::IsExclusiveOwner() {

 void XRTTupleAllocation::InitializeFromShapedBuffer(
     const xla::ShapedBuffer& shaped_buffer,
-    xla::DeviceMemoryAllocator* allocator, int device_ordinal) {
+    se::DeviceMemoryAllocator* allocator, int device_ordinal) {
   for (auto& buffer : buffers_) {
     // Make a reference-counted version of the allocated buffer.
     buffer.second = new XRTBufferAllocation(shaped_buffer.buffer(buffer.first),
@@ -549,7 +549,7 @@ XRTTupleAllocation::ToDeviceMemoryTree(
     if (!release_checker(buffer.first)) {
       *shaped_tree.mutable_element(buffer.first) = buffer.second->allocation();
     } else {
-      *shaped_tree.mutable_element(buffer.first) = xla::OwningDeviceMemory(
+      *shaped_tree.mutable_element(buffer.first) = se::OwningDeviceMemory(
           buffer.second->allocation(), device_ordinal_, allocator_);
       DiscardAllocation(buffer.first);
     }
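The XRT hunks above preserve the existing ownership idiom: Allocate() returns an owning wrapper, and Forget() releases the raw buffer to a container that assumes ownership ("Move our buffer into shaped_buffer, which takes ownership of it"). Condensed into a hypothetical helper, with retry disabled exactly as in the hunks above:

#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace tensorflow {

// Hypothetical helper: allocate `size` bytes and hand the raw buffer to the
// caller, as AllocateScopedShapedBuffer does for each sub-buffer above.
Status AllocateRaw(se::DeviceMemoryAllocator* allocator, int device_ordinal,
                   uint64 size, se::DeviceMemoryBase* out) {
  TF_ASSIGN_OR_RETURN(
      se::OwningDeviceMemory buffer,
      allocator->Allocate(device_ordinal, size, /*retry_on_failure=*/false));
  *out = buffer.Forget();  // Ownership leaves the RAII wrapper here.
  return Status::OK();
}

}  // namespace tensorflow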
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 38dcf3d2891..0cc0d3d62f4 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -25,7 +25,6 @@ limitations under the License.

 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/backend.h"
-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -34,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"
 #include "tensorflow/stream_executor/stream_executor.h"

 namespace tensorflow {
@@ -45,8 +45,7 @@ namespace tensorflow {
 class XRTBufferAllocation : public core::RefCounted {
  public:
   XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
-                      int device_ordinal,
-                      xla::DeviceMemoryAllocator* allocator);
+                      int device_ordinal, se::DeviceMemoryAllocator* allocator);
   ~XRTBufferAllocation() override;

   // The region of device memory being wrapped.
@@ -69,7 +68,7 @@ class XRTBufferAllocation : public core::RefCounted {
   uint64 size_ = 0;
   se::DeviceMemoryBase allocation_;
   int device_ordinal_;
-  xla::DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;
 };

 // Entry in the resource manager corresponding to an allocation handle returned
@@ -197,14 +196,14 @@ class XRTTupleAllocation : public ResourceBase {

  private:
   // Creates a new handle with (tuple) shape.
-  XRTTupleAllocation(int device_ordinal, xla::DeviceMemoryAllocator* allocator,
+  XRTTupleAllocation(int device_ordinal, se::DeviceMemoryAllocator* allocator,
                      const xla::Shape& on_host_shape,
                      const xla::Shape& on_device_shape);

   // Inherits the allocations represented in buffer, which must have the same
   // shape as buffers_.
   void InitializeFromShapedBuffer(const xla::ShapedBuffer& shaped_buffer,
-                                  xla::DeviceMemoryAllocator* allocator,
+                                  se::DeviceMemoryAllocator* allocator,
                                   int device_ordinal);

   // Takes a tree 'elements' where each leaf is an allocation, validates that
@@ -214,12 +213,12 @@ class XRTTupleAllocation : public ResourceBase {
   // grafted on.
   static Status ExpandTreeOfTuples(
       const xla::ShapeTree<ExpandedTupleInput>& elements, int device_ordinal,
-      xla::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
+      se::DeviceMemoryAllocator* allocator, xla::Shape* host_shape,
       xla::Shape* device_shape);

   // Location of the memory that is being managed.
   int device_ordinal_;
-  xla::DeviceMemoryAllocator* allocator_;
+  se::DeviceMemoryAllocator* allocator_;

   // The shape that the caller thinks the tuple has.
   const xla::Shape on_host_shape_;
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index b12a661cd3c..84d07806bde 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -666,6 +666,27 @@ cc_library(
     ],
 )

+cc_library(
+    name = "device_memory_allocator",
+    srcs = [
+        "device_memory_allocator.cc",
+        "owning_device_memory.cc",
+    ],
+    hdrs = [
+        "device_memory_allocator.h",
+        "owning_device_memory.h",
+    ],
+    deps = [
+        ":platform",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:stream_executor_no_cuda",
+        "//tensorflow/stream_executor/lib",
+        "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/strings:str_format",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
 tf_cc_test(
     name = "stream_test",
     size = "small",
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/stream_executor/device_memory_allocator.cc
similarity index 57%
rename from tensorflow/compiler/xla/service/device_memory_allocator.cc
rename to tensorflow/stream_executor/device_memory_allocator.cc
index e1e3b156fb3..e925b7be2ee 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.cc
+++ b/tensorflow/stream_executor/device_memory_allocator.cc
@@ -13,30 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"

 #include <string>

-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/core/lib/strings/numbers.h"

-namespace xla {
+namespace stream_executor {

 StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator(
-    const se::Platform* platform,
-    absl::Span<se::StreamExecutor* const> stream_executors)
+    const Platform* platform,
+    absl::Span<StreamExecutor* const> stream_executors)
     : DeviceMemoryAllocator(platform),
       stream_executors_(stream_executors.begin(), stream_executors.end()) {}

-StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
+port::StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
     int device_ordinal, uint64 size, bool retry_on_failure) {
-  TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
-                      GetStreamExecutor(device_ordinal));
-  se::DeviceMemoryBase result = stream_executor->AllocateArray<uint8>(size);
+  port::StatusOr<StreamExecutor*> stream_executor_or =
+      GetStreamExecutor(device_ordinal);
+  TF_RETURN_IF_ERROR(stream_executor_or.status());
+  DeviceMemoryBase result =
+      stream_executor_or.ValueOrDie()->AllocateArray<uint8>(size);
   if (size > 0 && result == nullptr) {
-    return ResourceExhausted(
+    return tensorflow::errors::ResourceExhausted(
         "Failed to allocate request for %s (%uB) on device ordinal %d",
         tensorflow::strings::HumanReadableNumBytes(size), size,
         device_ordinal);
   }
@@ -47,32 +48,34 @@ StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate(
   return OwningDeviceMemory(result, device_ordinal, this);
 }

-Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
-                                                 se::DeviceMemoryBase mem) {
+port::Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal,
+                                                       DeviceMemoryBase mem) {
   if (!mem.is_null()) {
-    TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor,
-                        GetStreamExecutor(device_ordinal));
+    port::StatusOr<StreamExecutor*> stream_executor_or =
+        GetStreamExecutor(device_ordinal);
+    TF_RETURN_IF_ERROR(stream_executor_or.status());
     VLOG(3) << absl::StreamFormat("Freeing %p on device ordinal %d",
                                   mem.opaque(), device_ordinal);
-    stream_executor->Deallocate(&mem);
+    stream_executor_or.ValueOrDie()->Deallocate(&mem);
   }
-  return Status::OK();
+  return port::Status::OK();
 }

-StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor(
-    int device_ordinal) {
+port::StatusOr<StreamExecutor*>
+StreamExecutorMemoryAllocator::GetStreamExecutor(int device_ordinal) {
   if (device_ordinal < 0) {
-    return InvalidArgument("device ordinal value (%d) must be non-negative",
-                           device_ordinal);
+    return tensorflow::errors::InvalidArgument(
+        "device ordinal value (%d) must be non-negative", device_ordinal);
   }
   if (device_ordinal >= stream_executors_.size()) {
-    return InvalidArgument(
+    return tensorflow::errors::InvalidArgument(
         "device ordinal value (%d) >= number of devices (%u)", device_ordinal,
         stream_executors_.size());
   }
   if (stream_executors_[device_ordinal] == nullptr) {
-    return NotFound("Device %s:%d present but not supported",
-                    platform()->Name(), device_ordinal);
+    return tensorflow::errors::NotFound(
+        absl::StrFormat("Device %s:%d present but not supported",
+                        platform()->Name(), device_ordinal));
   }
   return stream_executors_[device_ordinal];
 }
@@ -81,4 +84,4 @@ bool StreamExecutorMemoryAllocator::AllowsAsynchronousDeallocation() const {
   return false;
 }

-}  // namespace xla
+}  // namespace stream_executor
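One non-mechanical change in the relocated .cc is worth calling out: TF_ASSIGN_OR_RETURN comes from XLA's status_macros.h, which stream_executor presumably cannot include without creating a dependency cycle, so the code now unwraps each StatusOr by hand with TF_RETURN_IF_ERROR plus ValueOrDie(). The same pattern, isolated in a hypothetical helper:

#include <utility>

#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"

namespace stream_executor {

// Hypothetical free function showing the manual StatusOr unwrap used above.
port::Status AllocateAndDiscard(DeviceMemoryAllocator* allocator,
                                int device_ordinal, uint64 size) {
  port::StatusOr<OwningDeviceMemory> mem_or =
      allocator->Allocate(device_ordinal, size);
  TF_RETURN_IF_ERROR(mem_or.status());  // Replaces TF_ASSIGN_OR_RETURN.
  OwningDeviceMemory mem = std::move(mem_or.ValueOrDie());
  return allocator->Deallocate(device_ordinal, mem.Forget());
}

}  // namespace stream_executor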
diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/stream_executor/device_memory_allocator.h
similarity index 74%
rename from tensorflow/compiler/xla/service/device_memory_allocator.h
rename to tensorflow/stream_executor/device_memory_allocator.h
index a2308ee7a41..0d911e25e3a 100644
--- a/tensorflow/compiler/xla/service/device_memory_allocator.h
+++ b/tensorflow/stream_executor/device_memory_allocator.h
@@ -19,13 +19,13 @@ limitations under the License.
 #include <vector>

 #include "absl/types/span.h"
-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/stream_executor/lib/statusor.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"
+#include "tensorflow/stream_executor/platform.h"

-namespace xla {
+namespace stream_executor {

 // Interface for device memory allocators used within the XLA service. An
 // allocator is responsible for allocating memory on all devices of a particular
@@ -34,7 +34,7 @@ class DeviceMemoryAllocator {
  public:
   // Parameter platform indicates which platform the allocator allocates memory
   // on. Must be non-null.
-  explicit DeviceMemoryAllocator(const se::Platform* platform)
+  explicit DeviceMemoryAllocator(const Platform* platform)
       : platform_(platform) {}
   virtual ~DeviceMemoryAllocator() {}
@@ -47,23 +47,23 @@ class DeviceMemoryAllocator {
   // fails, the allocation should return immediately without retrying. An
   // example use case is optional scratch spaces where a failure has only
   // performance impact.
-  virtual StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                                bool retry_on_failure) = 0;
+  virtual port::StatusOr<OwningDeviceMemory> Allocate(
+      int device_ordinal, uint64 size, bool retry_on_failure) = 0;

   // Two-arg version of Allocate(), which sets retry-on-failure to true.
   //
   // (We don't simply use a default argument on the virtual Allocate function
   // because default args on virtual functions are disallowed by the Google
   // style guide.)
-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
+  port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size) {
     return Allocate(device_ordinal, size, /*retry_on_failure=*/true);
   }

   // Must be a nop for null pointers.
-  virtual Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) = 0;
+  virtual port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) = 0;

   // Return the platform that the allocator allocates memory on.
-  const se::Platform* platform() const { return platform_; }
+  const Platform* platform() const { return platform_; }

   // Can we call Deallocate() as soon as a computation has been scheduled on
   // a stream, or do we have to wait for the computation to complete first?
@@ -71,7 +71,7 @@ class DeviceMemoryAllocator {

  protected:
   friend class OwningDeviceMemory;
-  const se::Platform* platform_;
+  const Platform* platform_;
 };

 // Default memory allocator for a platform which uses
@@ -79,28 +79,28 @@ class DeviceMemoryAllocator {
 class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator {
  public:
   StreamExecutorMemoryAllocator(
-      const se::Platform* platform,
-      absl::Span<se::StreamExecutor* const> stream_executors);
+      const Platform* platform,
+      absl::Span<StreamExecutor* const> stream_executors);

-  StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
-                                        bool retry_on_failure) override;
+  port::StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size,
+                                              bool retry_on_failure) override;

   // Pull in two-arg overload that sets retry_on_failure to true.
   using DeviceMemoryAllocator::Allocate;

-  Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override;
+  port::Status Deallocate(int device_ordinal, DeviceMemoryBase mem) override;

   bool AllowsAsynchronousDeallocation() const override;

  private:
-  StatusOr<se::StreamExecutor*> GetStreamExecutor(int device_ordinal);
+  port::StatusOr<StreamExecutor*> GetStreamExecutor(int device_ordinal);

   // A vector indexed by device ordinal of StreamExecutors for each device of
   // the allocator's platform type. If an element is nullptr, then the device
   // with the respective device ordinal is not supported by XLA.
-  std::vector<se::StreamExecutor*> stream_executors_;
+  std::vector<StreamExecutor*> stream_executors_;
 };

-}  // namespace xla
+}  // namespace stream_executor

 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DEVICE_MEMORY_ALLOCATOR_H_
diff --git a/tensorflow/compiler/xla/service/owning_device_memory.cc b/tensorflow/stream_executor/owning_device_memory.cc
similarity index 82%
rename from tensorflow/compiler/xla/service/owning_device_memory.cc
rename to tensorflow/stream_executor/owning_device_memory.cc
index c115bc097f3..8b92ccfef10 100644
--- a/tensorflow/compiler/xla/service/owning_device_memory.cc
+++ b/tensorflow/stream_executor/owning_device_memory.cc
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

-#include "tensorflow/compiler/xla/service/owning_device_memory.h"
+#include "tensorflow/stream_executor/owning_device_memory.h"

-#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/stream_executor/device_memory_allocator.h"

-namespace xla {
+namespace stream_executor {

 void OwningDeviceMemory::Free() {
   CHECK(allocator_ != nullptr)
@@ -29,7 +29,7 @@ void OwningDeviceMemory::Free() {
   }

   allocator_ = nullptr;
-  mem_ = se::DeviceMemoryBase();
+  mem_ = DeviceMemoryBase();
 }

-}  // namespace xla
+}  // namespace stream_executor
diff --git a/tensorflow/compiler/xla/service/owning_device_memory.h b/tensorflow/stream_executor/owning_device_memory.h
similarity index 88%
rename from tensorflow/compiler/xla/service/owning_device_memory.h
rename to tensorflow/stream_executor/owning_device_memory.h
index 4be9bd80477..46946c4acf6 100644
--- a/tensorflow/compiler/xla/service/owning_device_memory.h
+++ b/tensorflow/stream_executor/owning_device_memory.h
@@ -16,12 +16,10 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_

-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"

-namespace xla {
+namespace stream_executor {

 // Break circular dependency between this file and device_memory_allocator.h.
 class DeviceMemoryAllocator;
@@ -43,7 +41,7 @@ class OwningDeviceMemory {
  public:
   OwningDeviceMemory() : device_ordinal_(-1), allocator_(nullptr) {}

-  explicit OwningDeviceMemory(se::DeviceMemoryBase mem, int device_ordinal,
+  explicit OwningDeviceMemory(DeviceMemoryBase mem, int device_ordinal,
                               DeviceMemoryAllocator* allocator)
       : mem_(mem), device_ordinal_(device_ordinal), allocator_(allocator) {
     CHECK(allocator != nullptr) << "allocator cannot be null.";
@@ -53,7 +51,7 @@ class OwningDeviceMemory {
       : mem_(other.mem_),
         device_ordinal_(other.device_ordinal_),
         allocator_(other.allocator_) {
-    other.mem_ = se::DeviceMemoryBase();
+    other.mem_ = DeviceMemoryBase();
     other.allocator_ = nullptr;
   }

@@ -65,7 +63,7 @@ class OwningDeviceMemory {
     device_ordinal_ = other.device_ordinal_;
     allocator_ = other.allocator_;

-    other.mem_ = se::DeviceMemoryBase();
+    other.mem_ = DeviceMemoryBase();
     other.allocator_ = nullptr;
     return *this;
   }

@@ -100,25 +98,25 @@ class OwningDeviceMemory {
   // !is_null() is sufficient but not necessary to imply `this` is active.
   bool is_null() const { return mem_.is_null(); }

-  se::DeviceMemoryBase AsDeviceMemoryBase() const {
+  DeviceMemoryBase AsDeviceMemoryBase() const {
     // This const_cast is necessary because DeviceMemoryBase's constructor
     // doesn't accept a const void*. This isn't ideal, but it's better than the
     // alternative of making a AsDeviceMemoryBase non-const member function.
     //
     // This is safe (i.e. not UB) because the casted pointer is derived from a
     // non-const pointer, namely mem_.opaque().
-    return se::DeviceMemoryBase(const_cast<void*>(opaque()), size());
+    return DeviceMemoryBase(const_cast<void*>(opaque()), size());
   }

   // Returns the wrapped DeviceMemoryBase without freeing it, and deactivates
   // this object. Precondition: `this` is active.
-  TF_MUST_USE_RESULT se::DeviceMemoryBase Forget() {
+  TF_MUST_USE_RESULT DeviceMemoryBase Forget() {
     CHECK(allocator_ != nullptr)
         << "Can't call Forget() on an inactive (i.e. moved from, Forget()'ten, "
           "or Free()'ed) instance.";
     allocator_ = nullptr;
-    se::DeviceMemoryBase mem(mem_);
-    mem_ = se::DeviceMemoryBase();
+    DeviceMemoryBase mem(mem_);
+    mem_ = DeviceMemoryBase();
     return mem;
   }

@@ -127,11 +125,11 @@ class OwningDeviceMemory {
   void Free();

  private:
-  se::DeviceMemoryBase mem_;
+  DeviceMemoryBase mem_;
   int device_ordinal_;
   DeviceMemoryAllocator* allocator_;  // Null if this object is inactive.
 };

-}  // namespace xla
+}  // namespace stream_executor

 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_
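OwningDeviceMemory keeps its documented state machine across the move: an instance is active while it owns memory, and becomes inactive after being moved from, Forget()'ten, or Free()'ed. A short sketch of those transitions under the new namespace (hypothetical demo function; TF_CHECK_OK from tensorflow/core):

#include <utility>

#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/stream_executor/device_memory_allocator.h"
#include "tensorflow/stream_executor/owning_device_memory.h"

namespace stream_executor {

// Hypothetical walkthrough of the ownership states documented above.
void OwnershipDemo(DeviceMemoryAllocator* allocator) {
  OwningDeviceMemory a =
      allocator->Allocate(/*device_ordinal=*/0, /*size=*/64).ValueOrDie();

  OwningDeviceMemory b = std::move(a);  // `a` is now inactive; its dtor is a no-op.

  DeviceMemoryBase raw = b.Forget();    // `b` inactive; the caller owns `raw`.
  TF_CHECK_OK(allocator->Deallocate(/*device_ordinal=*/0, raw));
}

}  // namespace stream_executor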