[XLA/GPU] Remove uses of BufferAssignment in GpuExecutable.

PiperOrigin-RevId: 348539166
Change-Id: I97aa232cd81837ffcfd10e24fd8f8b3dfb7953fe
Tim Shen 2020-12-21 15:37:48 -08:00 committed by TensorFlower Gardener
parent 722d7601e9
commit 63b8cdcb82
12 changed files with 58 additions and 47 deletions


@@ -358,6 +358,14 @@ class BufferAssignment {
     return allocations_;
   }
 
+  // This is similar to copying Allocations(), but since it's moved out, it
+  // preserves the addresses. Since BufferAllocation::Slice keeps a
+  // BufferAllocation*, and some backends keep BufferAllocation::Slice in
+  // xla::Executables, migrating off the use of addresses can be hard.
+  std::vector<BufferAllocation> ReleaseAllocations() {
+    return std::move(allocations_);
+  }
+
   // Returns the total size allocation holding all temporary buffers.
   int64 temp_allocation_total_size() const {
     return temp_allocation_total_size_;

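The comment on ReleaseAllocations() above is the crux of the change: BufferAllocation::Slice keeps a raw BufferAllocation*, so the allocations must keep their addresses when ownership moves from BufferAssignment into the executable. A minimal standalone sketch of that invariant follows; the Allocation and Slice types are illustrative stand-ins, not the XLA classes.

```cpp
#include <cassert>
#include <utility>
#include <vector>

struct Allocation { int index; };
// Like BufferAllocation::Slice: holds a raw pointer into someone else's vector.
struct Slice { const Allocation* allocation; };

int main() {
  std::vector<Allocation> assignment_owned = {{0}, {1}, {2}};
  Slice slice{&assignment_owned[1]};  // cached by an "executable"

  // Moving the vector transfers its heap storage, so element addresses
  // (and therefore the cached Slice pointer) stay valid. A copy would not
  // have this property: the copied elements live at new addresses.
  std::vector<Allocation> executable_owned = std::move(assignment_owned);
  assert(slice.allocation == &executable_owned[1]);
  return 0;
}
```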

@@ -672,8 +672,8 @@ cc_library(
         "gpu_debug_info_manager.h",
     ],
     deps = [
-        "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/service:hlo_proto_util",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -685,15 +685,10 @@ tf_cc_test(
     srcs = ["gpu_debug_info_manager_test.cc"],
     tags = tf_cuda_tests_tags(),
     deps = [
-        ":gpu_constants",
         ":gpu_debug_info_manager",
-        ":gpu_hlo_schedule",
-        ":stream_assignment",
-        "//tensorflow/compiler/xla/service:buffer_assignment",
         "//tensorflow/compiler/xla/service:hlo_proto_cc",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
-        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
     ],
 )


@@ -34,13 +34,13 @@ namespace gpu {
 Status BufferAllocations::TearDown(
     const std::set<se::DeviceMemoryBase>& live_addresses,
-    const BufferAssignment* buffer_assignment) {
+    absl::Span<const BufferAllocation> allocations) {
   // Deallocate temporary buffers, taking care to try to deallocate all of them
   // even if one of the deallocations fails.
   Status status;
-  const int64 num_buffers = buffer_assignment->Allocations().size();
+  const int64 num_buffers = allocations.size();
   for (BufferAllocation::Index i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = buffer_assignment->GetAllocation(i);
+    const BufferAllocation& allocation = allocations[i];
     se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index());
 
     // Deallocate buffers marked "maybe_live_out" but aren't actually live out,
     // and temp buffers.


@@ -70,7 +70,7 @@ class BufferAllocations {
   // Tears down all buffers allocated by this object that are not in
   // `live_addresses`.
   Status TearDown(const std::set<se::DeviceMemoryBase>& live_addresses,
-                  const BufferAssignment* buffer_assignment);
+                  absl::Span<const BufferAllocation> allocations);
 
   std::string ToString() {
     std::string out;

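TearDown() now takes an absl::Span<const BufferAllocation> rather than a BufferAssignment*. A point worth noting, which is a general Abseil property rather than part of this diff: a const span binds implicitly to a std::vector, so callers such as GpuExecutable can pass their allocations_ vector straight through. A hypothetical minimal example (CountLiveOut is made up for illustration):

```cpp
#include <vector>
#include "absl/types/span.h"

// Stand-in for the new TearDown-style signature: a read-only view, no copy.
int CountLiveOut(absl::Span<const int> allocations) {
  int count = 0;
  for (int a : allocations) {
    if (a > 0) ++count;  // pretend positive means "maybe live out"
  }
  return count;
}

int main() {
  std::vector<int> owned = {3, -1, 7};
  int live = CountLiveOut(owned);  // implicit vector -> Span conversion
  return live == 2 ? 0 : 1;
}
```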

@@ -869,13 +869,18 @@ StatusOr<std::unique_ptr<Executable>> GpuCompiler::RunBackend(
       absl::flat_hash_map<ShapeIndex, GpuExecutable::OutputInfo>;
   TF_ASSIGN_OR_RETURN(OutputInfoMap output_info,
                       GetOutputInfo(*module, *buffer_assignment));
+  auto buffer_assignment_proto =
+      std::make_unique<BufferAssignmentProto>(buffer_assignment->ToProto());
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
   GpuVersion gpu_version = GetGpuVersion(stream_exec);
   auto* gpu_executable = new GpuExecutable(
       {std::move(backend_result.first), std::move(backend_result.second),
        gpu_version, std::move(thunk_schedule), std::move(constants),
-       std::move(output_info), std::move(module), std::move(buffer_assignment),
-       std::move(profile_printer), std::move(profile_index_map)});
+       std::move(output_info), std::move(module), std::move(allocations),
+       std::move(buffer_assignment_proto), std::move(profile_printer),
+       std::move(profile_index_map)});
 
   if (embed_ir_in_executable) {
     DCHECK_NE("", ir_module_string_before_opt);
     gpu_executable->set_ir_module_string(ir_module_string_before_opt);


@@ -22,7 +22,7 @@ namespace gpu {
 
 void GpuDebugInfoManager::RegisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   if (active_modules_.find(module_id) != active_modules_.end()) {
     active_modules_[module_id].instances.emplace_back(hlo_module,
@@ -40,7 +40,7 @@ void GpuDebugInfoManager::RegisterModule(
 // However during tracing, we will defer the cleanup after serialization.
 void GpuDebugInfoManager::UnregisterModule(
     const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-    std::shared_ptr<const BufferAssignment> buffer_assignment) {
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment) {
   tensorflow::mutex_lock lock(mutex_);
   CHECK(active_modules_.find(module_id) != active_modules_.end());
   GpuModuleEntry& active_module = active_modules_[module_id];
@@ -146,8 +146,10 @@ void GpuDebugInfoManager::StopTracing(
       // non-nullptr. Due to the inconvenience of creation of buffer_assignment
       // object in test, we set it to nullptr and guard this for it.
       if (m.instances[0].hlo_module && m.instances[0].buffer_assignment) {
-        info.hlo_proto = absl::make_unique<HloProto>(MakeHloProto(
-            *m.instances[0].hlo_module, *m.instances[0].buffer_assignment));
+        info.hlo_proto = absl::make_unique<HloProto>(
+            MakeHloProto(*m.instances[0].hlo_module));
+        *info.hlo_proto->mutable_buffer_assignment() =
+            *m.instances[0].buffer_assignment;
       }
       module_debug_info->emplace_back(std::move(info));
     }


@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_DEBUG_INFO_MANAGER_H_
 
 #include "absl/container/flat_hash_map.h"
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/core/lib/core/status.h"
 
@@ -56,14 +56,14 @@ class GpuDebugInfoManager {
   // Modules with same module id can be registered and tracked separately.
   void RegisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Unregister an active module. When the last active module of the same
   // module id is out of scope, we remove it from our database.
   // However during tracing, we will defer the cleanup after serialization.
   void UnregisterModule(
       const ModuleIdentifier& module_id, std::shared_ptr<HloModule> hlo_module,
-      std::shared_ptr<const BufferAssignment> buffer_assignment);
+      std::shared_ptr<const BufferAssignmentProto> buffer_assignment);
 
   // Register when the module start execution on certain device.
   // TODO(jiesun): Do we need to track which device this is?
@@ -110,10 +110,10 @@ class GpuDebugInfoManager {
   // tracking, they need to be tracked separately.
   struct GpuModuleInstance {
     GpuModuleInstance(std::shared_ptr<HloModule> m,
-                      std::shared_ptr<const BufferAssignment> b)
+                      std::shared_ptr<const BufferAssignmentProto> b)
         : hlo_module(std::move(m)), buffer_assignment(std::move(b)) {}
 
     std::shared_ptr<HloModule> hlo_module;
-    std::shared_ptr<const BufferAssignment> buffer_assignment;
+    std::shared_ptr<const BufferAssignmentProto> buffer_assignment;
     bool active = true;
   };


@@ -14,7 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/compiler/xla/service/gpu/gpu_debug_info_manager.h"
 
-#include "tensorflow/compiler/xla/service/buffer_assignment.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
 namespace xla {
@@ -30,7 +30,7 @@ class GpuDebugInfoManagerTest : public HloTestBase {
     int unique_id;
     string id;
     std::shared_ptr<HloModule> module;
-    std::shared_ptr<BufferAssignment> buffer_assignment;
+    std::shared_ptr<BufferAssignmentProto> buffer_assignment;
   };
 
   // Return unique id of this module.


@@ -62,18 +62,19 @@ GpuExecutable::GpuExecutable(GpuExecutable::Params params)
       binary_(std::move(params.binary)),
       gpu_version_(params.gpu_version),
       thunk_schedule_(std::move(params.thunk_schedule)),
-      assignment_(std::move(params.assignment)),
+      allocations_(std::move(params.allocations)),
+      debug_buffer_assignment_(std::move(params.debug_buffer_assignment)),
       constants_(std::move(params.constants)),
       output_info_(std::move(params.output_info)) {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->RegisterModule(module().name(), shared_module(),
-                                             assignment_);
+                                             debug_buffer_assignment_);
 }
 
 GpuExecutable::~GpuExecutable() {
-  CHECK(has_module() && assignment_);
+  CHECK(has_module());
   GpuDebugInfoManager::Get()->UnregisterModule(module().name(), shared_module(),
-                                               assignment_);
+                                               debug_buffer_assignment_);
 
   {
     // We could have issued host->device mem copies in ResolveConstantGlobals.
@@ -376,11 +377,11 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
       [&] { return std::string("Build buffer allocations"); },
       tensorflow::profiler::TraceMeLevel::kInfo);
 
-  const int64 num_buffers = assignment_->Allocations().size();
+  const int64 num_buffers = allocations_.size();
   std::vector<se::DeviceMemoryBase> buffers;
   buffers.reserve(num_buffers);
   for (int64 i = 0; i < num_buffers; ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+    const BufferAllocation& allocation = allocations_[i];
     TF_ASSIGN_OR_RETURN(
         se::DeviceMemoryBase buffer,
         BufferForAllocation(arguments, globals, allocation, memory_allocator,
@@ -537,7 +538,7 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
 
   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
-      buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
+      buffer_allocations.TearDown(buffers_in_result, allocations_));
 
   // Free allocations for arguments.
   MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
@@ -551,9 +552,8 @@ int64 GpuExecutable::SizeOfGeneratedCodeInBytes() const {
     return -1;
   }
   int64 size = binary().size();
-  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
-       ++i) {
-    const BufferAllocation& allocation = assignment_->GetAllocation(i);
+  for (BufferAllocation::Index i = 0; i < allocations_.size(); ++i) {
+    const BufferAllocation& allocation = allocations_[i];
     if (allocation.is_constant()) {
       size += allocation.size();
     }


@@ -71,7 +71,8 @@ class GpuExecutable : public Executable {
     std::vector<ConstantInfo> constants;
     absl::flat_hash_map<ShapeIndex, OutputInfo> output_info;
     std::unique_ptr<HloModule> hlo_module;
-    std::unique_ptr<const BufferAssignment> assignment;
+    std::vector<BufferAllocation> allocations;
+    std::unique_ptr<BufferAssignmentProto> debug_buffer_assignment;
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data = nullptr;
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map = nullptr;
   };
@@ -108,8 +109,8 @@ class GpuExecutable : public Executable {
       std::vector<ExecutionInput> arguments,
       HloExecutionProfile* hlo_execution_profile) override;
 
-  std::shared_ptr<const BufferAssignment> GetBufferAssignment() const {
-    return assignment_;
+  absl::Span<const BufferAllocation> GetAllocations() const {
+    return allocations_;
   }
 
  private:
@@ -176,7 +177,9 @@ class GpuExecutable : public Executable {
 
   // Owns the buffer data at runtime. It provides information to allocate
   // memory for every output/temp buffers.
-  const std::shared_ptr<const BufferAssignment> assignment_;
+  const std::vector<BufferAllocation> allocations_;
+
+  std::shared_ptr<BufferAssignmentProto> debug_buffer_assignment_;
 
   // Cache of module handles and constant buffer allocation maps used by
   // `ResolveConstantGlobals`.


@@ -46,12 +46,9 @@ class GemmRewriteTest : public GpuCodegenTest {
             backend().default_stream_executor()->GetAllocator()));
     GpuExecutable* gpu_executable =
         static_cast<GpuExecutable*>(executable.get());
-    std::shared_ptr<const BufferAssignment> buffer_assignment =
-        gpu_executable->GetBufferAssignment();
-    CHECK_EQ(buffer_assignment->Allocations().size(),
-             expected_number_of_allocations)
-        << "Unexpected buffer assignment. Was:\n"
-        << buffer_assignment->ToString();
+    absl::Span<const BufferAllocation> allocations =
+        gpu_executable->GetAllocations();
+    CHECK_EQ(allocations.size(), expected_number_of_allocations);
   }
 };


@@ -589,13 +589,14 @@ StatusOr<std::unique_ptr<Executable>> MlirCompilerImpl::RunBackend(
 
   TF_ASSIGN_OR_RETURN(auto output_info,
                       xla::gpu::GetOutputInfo(*module, *buffer_assignment));
+  std::vector<BufferAllocation> allocations =
+      buffer_assignment->ReleaseAllocations();
 
   // TODO(b/137624192): Add profiling support.
   return {absl::make_unique<GpuExecutable>(GpuExecutable::Params{
       std::move(ptx), std::move(cubin), GetGpuVersion(stream_exec),
       std::move(thunk_schedule), std::vector<GpuExecutable::ConstantInfo>(),
-      std::move(output_info), std::move(module),
-      std::move(buffer_assignment)})};
+      std::move(output_info), std::move(module), std::move(allocations)})};
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> MlirCompilerImpl::Compile(