diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index 19a0e6ef6da..515b58d2f5d 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -358,6 +358,14 @@ class BufferAssignment {
     return allocations_;
   }
 
+  // This is similar to copying Allocations(), except that moving the vector
+  // out preserves the addresses of its elements. BufferAllocation::Slice
+  // keeps a BufferAllocation*, and some backends keep BufferAllocation::Slice
+  // in xla::Executables, so migrating off the use of addresses can be hard.
+  std::vector<BufferAllocation> ReleaseAllocations() {
+    return std::move(allocations_);
+  }
+
   // Returns the total size allocation holding all temporary buffers.
   int64 temp_allocation_total_size() const {
     return temp_allocation_total_size_;
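A note for readers outside the XLA codebase on why `ReleaseAllocations()` moves rather than copies: moving a `std::vector` transfers its heap buffer, so pointers to its elements, such as the `BufferAllocation*` held by `BufferAllocation::Slice`, stay valid. A minimal, self-contained sketch; `Allocation` and `Slice` here are hypothetical stand-ins, not the real XLA classes:

```cpp
#include <cassert>
#include <utility>
#include <vector>

// Simplified stand-ins for BufferAllocation and BufferAllocation::Slice.
struct Allocation { int index; };
struct Slice { const Allocation* allocation; };  // non-owning pointer

int main() {
  std::vector<Allocation> allocations = {{0}, {1}, {2}};
  Slice slice{&allocations[1]};  // points into the vector's heap storage

  // Moving transfers ownership of the same heap buffer, so element addresses
  // (and therefore `slice`) remain valid.
  std::vector<Allocation> released = std::move(allocations);
  assert(slice.allocation == &released[1]);

  // A copy would allocate fresh storage; slices would keep pointing at the
  // source vector, which is the migration hazard the comment describes.
  return 0;
}
```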
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 2b6968162ca..a456b3f026d 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -672,8 +672,8 @@ cc_library(
         "gpu_debug_info_manager.h",
     ],
     deps = [
-        "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/service:hlo_proto_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -685,15 +685,10 @@ tf_cc_test(
     srcs = ["gpu_debug_info_manager_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
-        ":gpu_constants",
         ":gpu_debug_info_manager",
-        ":gpu_hlo_schedule",
-        ":stream_assignment",
-        "//tensorflow/compiler/xla/service:buffer_assignment",
+        "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:test",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
index cac335ce087..a89cb435c67 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc
@@ -34,13 +34,13 @@ namespace gpu {
 
 Status BufferAllocations::TearDown(
     const std::set<se::DeviceMemoryBase>& live_addresses,
-    const BufferAssignment* buffer_assignment) {
+    absl::Span<const BufferAllocation> allocations) {
   // Deallocate temporary buffers, taking care to try to deallocate all of them
   // even if one of the deallocations fails.
   Status status;
-  const int64 num_buffers = buffer_assignment->Allocations().size();
+  const int64 num_buffers = allocations.size();
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
+    const BufferAllocation& allocation = allocations[i];
     se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index());
     // Deallocate buffers marked "maybe_live_out" but aren't actually live out,
     // and temp buffers.
diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
index 0d534b0d286..d5fa8c551d7 100644
--- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
+++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h
@@ -70,7 +70,7 @@ class BufferAllocations {
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
   Status TearDown(const std::set<se::DeviceMemoryBase>& live_addresses,
-                  const BufferAssignment* buffer_assignment);
+                  absl::Span<const BufferAllocation> allocations);
 
   std::string ToString() {
     std::string out;
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index 8a88a31d02a..8084e0eb71d 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -869,13 +869,18 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
       absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
   TF_ASSIGN_OR_RETURN(OutputInfoMap output_info,
                       GetOutputInfo(*module, *buffer_assignment));
+  auto buffer_assignment_proto =
+      std::make_unique<BufferAssignmentProto>(buffer_assignment->ToProto());
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
   GpuVersion gpu_version = GetGpuVersion(stream_exec);
   auto* gpu_executable = new GpuExecutable(
       {std::move(backend_result.first), std::move(backend_result.second),
        gpu_version, std::move(thunk_schedule), std::move(constants),
-       std::move(output_info), std::move(module), std::move(buffer_assignment),
-       std::move(profile_printer), std::move(profile_index_map)});
+       std::move(output_info), std::move(module), std::move(allocations),
+       std::move(buffer_assignment_proto), std::move(profile_printer),
+       std::move(profile_index_map)});
   if (embed_ir_in_executable) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);
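One subtlety in the gpu_compiler.cc hunk above: the proto snapshot has to be taken before `ReleaseAllocations()`, since releasing moves the allocation vector out of the `BufferAssignment`, presumably leaving nothing for `ToProto()` to serialize. A sketch of the ordering constraint under hypothetical stand-in types:

```cpp
#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

// Simplified stand-ins; the real types are BufferAssignment,
// BufferAssignmentProto, and BufferAllocation.
struct Allocation { long size; };
struct AssignmentProto { std::size_t num_allocations; };

class Assignment {
 public:
  explicit Assignment(std::vector<Allocation> a) : allocations_(std::move(a)) {}
  AssignmentProto ToProto() const { return {allocations_.size()}; }
  std::vector<Allocation> ReleaseAllocations() { return std::move(allocations_); }

 private:
  std::vector<Allocation> allocations_;
};

int main() {
  Assignment assignment({{16}, {32}});
  // Mirror the order in GpuCompiler::RunBackend: snapshot the proto while the
  // allocations are still inside the assignment...
  auto proto = std::make_unique<AssignmentProto>(assignment.ToProto());
  // ...then move the allocations out for the executable to own. Reversing
  // these two steps would serialize an already-emptied assignment.
  std::vector<Allocation> allocations = assignment.ReleaseAllocations();
  return (proto->num_allocations == 2 && allocations.size() == 2) ? 0 : 1;
}
```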
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.cc
index 51888c0b8c7..9851ce0b8af 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.cc
@@ -22,7 +22,7 @@ namespace gpu {
 
 void GpuDebugInfoManager::RegisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   if (active_modules_.find(module_id) != active_modules_.end()) {
     active_modules_[module_id].instances.emplace_back(hlo_module,
@@ -40,7 +40,7 @@ void GpuDebugInfoManager::RegisterModule(
 // However during tracing, we will defer the cleanup after serialization.
 void GpuDebugInfoManager::UnregisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   CHECK(active_modules_.find(module_id) != active_modules_.end());
   GpuModuleEntry& active_module = active_modules_[module_id];
@@ -146,8 +146,10 @@ void GpuDebugInfoManager::StopTracing(
       // non-nullptr. Due to the inconvenience of creation of buffer_assignment
       // object in test, we set it to nullptr and guard this for it.
       if (m.instances[0].hlo_module && m.instances[0].buffer_assignment) {
-        info.hlo_proto = absl::make_unique<HloProto>(MakeHloProto(
-            *m.instances[0].hlo_module, *m.instances[0].buffer_assignment));
+        info.hlo_proto = absl::make_unique<HloProto>(
+            MakeHloProto(*m.instances[0].hlo_module));
+        *info.hlo_proto->mutable_buffer_assignment() =
+            *m.instances[0].buffer_assignment;
       }
       module_debug_info->emplace_back(std::move(info));
     }
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h
index 0a8b444243e..36d4435d284 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_DEBUG_INFO_MANAGER_H_
 
 #include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -56,14 +56,14 @@ class GpuDebugInfoManager {
   // Modules with same module id can be registered and tracked separately.
   void RegisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Unregister an active module. When the last active module of the same
   // module id is out of scope, we remove it from our database.
   // However during tracing, we will defer the cleanup after serialization.
   void UnregisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Register when the module start execution on certain device.
   // TODO(jiesun): Do we need to track which device this is?
@@ -110,10 +110,10 @@ class GpuDebugInfoManager {
   // tracking, they need to be tracked separately.
   struct GpuModuleInstance {
     GpuModuleInstance(std::shared_ptr<HloModule> m,
-                      std::shared_ptr<const BufferAssignment> b)
+                      std::shared_ptr<const BufferAssignmentProto> b)
         : hlo_module(std::move(m)), buffer_assignment(std::move(b)) {}
     std::shared_ptr<HloModule> hlo_module;
-    std::shared_ptr<const BufferAssignment> buffer_assignment;
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment;
     bool active = true;
   };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager_test.cc
index 5ea26c55823..e0d42a3d692 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager_test.cc
@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h"
 
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace xla {
@@ -30,7 +30,7 @@ class GpuDebugInfoManagerTest : public HloTestBase {
     int unique_id;
     string id;
     std::shared_ptr<HloModule> module;
-    std::shared_ptr<const BufferAssignment> buffer_assignment;
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment;
   };
 
   // Return unique id of this module.
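The StopTracing change trades re-serializing a live BufferAssignment for copying the proto snapshot taken at compile time. A property worth noting is that protobuf message assignment deep-copies, so the shared snapshot stays immutable no matter how many HloProtos it is spliced into. A sketch with hand-written stand-ins for the generated message classes (illustration only; the real ones come from hlo.pb.h):

```cpp
#include <memory>
#include <string>

// Hand-written stand-ins for the generated message classes.
struct BufferAssignmentProto { std::string blob; };
struct HloProto {
  BufferAssignmentProto buffer_assignment;
  BufferAssignmentProto* mutable_buffer_assignment() {
    return &buffer_assignment;
  }
};

int main() {
  // Many executables can share one immutable compile-time snapshot.
  auto snapshot = std::make_shared<const BufferAssignmentProto>(
      BufferAssignmentProto{"serialized at compile time"});

  // StopTracing-style splice: message assignment deep-copies, so the shared
  // snapshot is never mutated.
  auto hlo_proto = std::make_unique<HloProto>();
  *hlo_proto->mutable_buffer_assignment() = *snapshot;
  return hlo_proto->buffer_assignment.blob == snapshot->blob ? 0 : 1;
}
```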
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 336665a8058..51cd70e8bbb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -62,18 +62,19 @@ GpuExecutable::GpuExecutable(GpuExecutable::Params params)
       binary_(std::move(params.binary)),
       gpu_version_(params.gpu_version),
       thunk_schedule_(std::move(params.thunk_schedule)),
-      assignment_(std::move(params.assignment)),
+      allocations_(std::move(params.allocations)),
+      debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
       constants_(std::move(params.constants)),
       output_info_(std::move(params.output_info)) {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
-                                             assignment_);
+                                             debug_buffer_assignment_);
 }
 
 GpuExecutable::~GpuExecutable() {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->UnregisterModule(module().name(), shared_module(),
-                                               assignment_);
+                                               debug_buffer_assignment_);
 
   {
     // We could have issued host->device mem copies in ResolveConstantGlobals.
@@ -376,11 +377,11 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
       [&] { return std::string("Build buffer allocations"); },
       tensorflow::profiler::TraceMeLevel::kInfo);
 
-  const int64 num_buffers = assignment_->Allocations().size();
+  const int64 num_buffers = allocations_.size();
   std::vector<se::DeviceMemoryBase> buffers;
   buffers.reserve(num_buffers);
   for (int64 i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    const BufferAllocation& allocation = allocations_[i];
     TF_ASSIGN_OR_RETURN(
         se::DeviceMemoryBase buffer,
         BufferForAllocation(arguments, globals, allocation, memory_allocator,
@@ -537,7 +538,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
 
   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
-      buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
+      buffer_allocations.TearDown(buffers_in_result, allocations_));
 
   // Free allocations for arguments.
   MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
@@ -551,9 +552,8 @@ int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
     return -1;
   }
   int64 size = binary().size();
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+  for (BufferAllocation::Index i = 0; i < allocations_.size(); ++i) {
+    const BufferAllocation& allocation = allocations_[i];
     if (allocation.is_constant()) {
       size += allocation.size();
     }
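The call `buffer_allocations.TearDown(buffers_in_result, allocations_)` compiles without an explicit conversion because `absl::Span<const T>` binds implicitly to a `const std::vector<T>`. A small sketch; `Allocation` and `TotalBytes` are hypothetical:

```cpp
#include <vector>

#include "absl/types/span.h"

// Illustrative stand-in for BufferAllocation.
struct Allocation { long size; };

// A callee that only iterates can take a span instead of the owning
// BufferAssignment; absl::Span<const T> binds to any contiguous storage.
long TotalBytes(absl::Span<const Allocation> allocations) {
  long total = 0;
  for (const Allocation& allocation : allocations) total += allocation.size;
  return total;
}

int main() {
  const std::vector<Allocation> allocations = {{16}, {32}, {64}};
  return TotalBytes(allocations) == 112 ? 0 : 1;  // implicit vector -> span
}
```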
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 690fb6aa94d..e977248872a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -71,7 +71,8 @@ class GpuExecutable : public Executable {
     std::vector<ConstantInfo> constants;
     absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
     std::unique_ptr<HloModule> hlo_module;
-    std::unique_ptr<const BufferAssignment> assignment;
+    std::vector<BufferAllocation> allocations;
+    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment;
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data = nullptr;
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map = nullptr;
   };
@@ -108,8 +109,8 @@ class GpuExecutable : public Executable {
       std::vector<ExecutionInput> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
-    return assignment_;
+  absl::Span<const BufferAllocation> GetAllocations() const {
+    return allocations_;
   }
 
  private:
@@ -176,7 +177,9 @@ class GpuExecutable : public Executable {
 
   // Owns the buffer data at runtime. It provides information to allocate
   // memory for every output/temp buffers.
-  const std::shared_ptr<const BufferAssignment> assignment_;
+  const std::vector<BufferAllocation> allocations_;
+
+  std::shared_ptr<const BufferAssignmentProto> debug_buffer_assignment_;
 
   // Cache of module handles and constant buffer allocation maps used by
   // `ResolveConstantGlobals`.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gemm_rewrite_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gemm_rewrite_test.cc
index bc832b4717a..9581673ffdb 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gemm_rewrite_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gemm_rewrite_test.cc
@@ -46,12 +46,9 @@ class GemmRewriteTest : public GpuCodegenTest {
                             backend().default_stream_executor()->GetAllocator()));
     GpuExecutable* gpu_executable =
         static_cast<GpuExecutable*>(executable.get());
-    std::shared_ptr<const BufferAssignment> buffer_assignment =
-        gpu_executable->GetBufferAssignment();
-    CHECK_EQ(buffer_assignment->Allocations().size(),
-             expected_number_of_allocations)
-        << "Unexpected buffer assignment. Was:\n"
-        << buffer_assignment->ToString();
+    absl::Span<const BufferAllocation> allocations =
+        gpu_executable->GetAllocations();
+    CHECK_EQ(allocations.size(), expected_number_of_allocations);
   }
 };
 
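One caveat the new `GetAllocations()` accessor introduces for callers like the test above: the returned span aliases the vector owned by the executable, whereas the old `shared_ptr<const BufferAssignment>` kept the assignment alive on its own. A sketch with a hypothetical stand-in executable:

```cpp
#include <vector>

#include "absl/types/span.h"

// Illustrative stand-ins only.
struct Allocation { long size; };

class FakeExecutable {
 public:
  absl::Span<const Allocation> GetAllocations() const { return allocations_; }

 private:
  const std::vector<Allocation> allocations_ = {{16}, {32}};
};

int main() {
  FakeExecutable executable;
  // Fine while `executable` is alive; the span aliases its vector.
  absl::Span<const Allocation> allocations = executable.GetAllocations();
  long total = 0;
  for (const Allocation& a : allocations) total += a.size;
  // Using `allocations` after `executable` is destroyed would be a
  // use-after-free; copy into a std::vector if it must outlive the executable.
  return total == 48 ? 0 : 1;
}
```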
diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
index 8648ba3c684..908355fac27 100644
--- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
+++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler_impl.cc
@@ -589,13 +589,14 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
 
   TF_ASSIGN_OR_RETURN(auto output_info,
                       xla::gpu::GetOutputInfo(*module, *buffer_assignment));
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
   // TODO(b/137624192): Add profiling support.
   return {absl::make_unique<GpuExecutable>(GpuExecutable::Params{
       std::move(ptx), std::move(cubin), GetGpuVersion(stream_exec),
       std::move(thunk_schedule), std::vector<GpuExecutable::ConstantInfo>(),
-      std::move(output_info), std::move(module),
-      std::move(buffer_assignment)})};
+      std::move(output_info), std::move(module), std::move(allocations)})};
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(
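The MLIR path can omit the trailing Params fields because `GpuExecutable::Params` is an aggregate: members without an initializer-clause are value-initialized, so `debug_buffer_assignment` and the profiling fields default to null. A sketch with a simplified stand-in Params (not the real struct):

```cpp
#include <memory>
#include <vector>

// Simplified stand-in for GpuExecutable::Params, illustration only.
struct Params {
  std::vector<int> allocations;
  std::unique_ptr<int> debug_buffer_assignment;
  std::unique_ptr<int> hlo_profile_printer_data = nullptr;
};

int main() {
  // Aggregate initialization: members without an initializer-clause are
  // value-initialized, so both smart pointers end up null.
  Params params{std::vector<int>{1, 2, 3}};
  return (params.debug_buffer_assignment == nullptr &&
          params.hlo_profile_printer_data == nullptr)
             ? 0
             : 1;
}
```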