From bbe0c4590c90a83dde1c24b91284ef370ba06eb0 Mon Sep 17 00:00:00 2001
From: George Karpenkov
Date: Tue, 9 Jun 2020 19:10:24 -0700
Subject: [PATCH] [XLA:GPU] Support buffer aliasing for XLA:GPU

This is the change on the XLA side; the change on the TF/XLA bridge is still
TBD.

Dropping the check in xla_launch_util, as aliased buffers are no longer
represented as nulls.

PiperOrigin-RevId: 315607922
Change-Id: I24903d9288604cff142a7f0872d924a5da621e49
---
 tensorflow/compiler/jit/xla_launch_util.cc    |   4 -
 .../xla/service/gpu/gpu_executable.cc         | 188 +++++++++++-------
 .../xla/tests/buffer_donation_test.cc         |   4 +-
 3 files changed, 120 insertions(+), 76 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 8c24f182f5c..209220938ed 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -468,10 +468,6 @@ Status XlaComputationLaunchContext::PopulateOutputs(
           << "Invalid input for outputs " << i << ": " << input_index;
       ctx->set_output(i, ctx->input(input_index));
     } else {
-      if (MustAliasOutput(input_output_alias, output_num)) {
-        DCHECK(output.buffer({output_num}).is_null())
-            << "Expected output buffer to be aliased, but it is not nil.";
-      }
       if (allocate_xla_tensors_) {
         TF_RETURN_IF_ERROR(SetBufferForTensorUnderAllocateXlaTensors(
             input_output_alias, output_num, ctx, i, shape, &output,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index bf65df20544..c8b11cab31a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -360,6 +360,27 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
   }
 }
 
+static Status CheckAlignment(const BufferAllocation& allocation,
+                             se::DeviceMemoryBase buffer, int arg_idx) {
+  const int64 expected_alignment = [&] {
+    if (allocation.is_entry_computation_parameter()) {
+      return kEntryParameterAlignBytes;
+    } else if (allocation.is_constant()) {
+      return kConstantBufferAlignBytes;
+    } else {
+      return kXlaAllocatedBufferAlignBytes;
+    }
+  }();
+  if (!buffer.is_null() &&
+      reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != 0) {
+    return InternalError(
+        "Address of buffer %d must be a multiple of %x, but "
+        "was %p",
+        arg_idx, expected_alignment, buffer.opaque());
+  }
+  return Status::OK();
+}
+
 StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
     absl::Span<ExecutionInput const> arguments,
     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
@@ -378,28 +399,37 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
         se::DeviceMemoryBase buffer,
         BufferForAllocation(arguments, globals, allocation, memory_allocator,
                             executor->device_ordinal(), i));
-    const int64 expected_alignment = [&] {
-      if (allocation.is_entry_computation_parameter()) {
-        return kEntryParameterAlignBytes;
-      } else if (allocation.is_constant()) {
-        return kConstantBufferAlignBytes;
-      } else {
-        return kXlaAllocatedBufferAlignBytes;
-      }
-    }();
-    if (!buffer.is_null() &&
-        reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
-            0) {
-      return InternalError(
-          "Address of buffer %d must be a multiple of %x, but "
-          "was %p",
-          i, expected_alignment, buffer.opaque());
-    }
     buffers.push_back(buffer);
+    TF_RETURN_IF_ERROR(CheckAlignment(allocation, buffer, i));
  }
   return {{buffers, executor->device_ordinal(), memory_allocator}};
 }
 
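// A minimal, standalone sketch of the alignment check factored out above,
// using only the standard library. The alignment values and names here are
// illustrative assumptions, not the XLA constants; the snippet only mirrors
// the "address must be a multiple of the expected alignment" rule.
#include <cstdint>
#include <cstdio>

namespace alignment_sketch {

constexpr std::uintptr_t kExampleParameterAlignBytes = 16;
constexpr std::uintptr_t kExampleDefaultAlignBytes = 64;

// Pick the expected alignment from the kind of buffer, then verify that the
// address is a multiple of it, as CheckAlignment does for real device buffers.
bool IsSuitablyAligned(const void* ptr, bool is_entry_parameter) {
  const std::uintptr_t expected =
      is_entry_parameter ? kExampleParameterAlignBytes : kExampleDefaultAlignBytes;
  return reinterpret_cast<std::uintptr_t>(ptr) % expected == 0;
}

}  // namespace alignment_sketch

int main() {
  alignas(64) static char storage[128];
  std::printf("aligned: %d\n",
              alignment_sketch::IsSuitablyAligned(storage,
                                                  /*is_entry_parameter=*/false));
}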
+// Returns `true` if the entire tuple contents is aliased.
+static bool EntireTupleContentsAliased(
+    const Shape& output_shape, const ShapeIndex& index,
+    const HloInputOutputAliasConfig& alias_config) {
+  const Shape& indexed_shape = ShapeUtil::GetSubshape(output_shape, index);
+  if (!indexed_shape.IsTuple()) {
+    return false;
+  }
+  bool all_aliased = true;
+  ShapeUtil::ForEachSubshape(
+      indexed_shape, [&](const Shape& subshape, const ShapeIndex& subindex) {
+        if (subindex.empty()) {
+          return;
+        }
+        std::vector<int64> full_index;
+        absl::c_copy(index, std::back_inserter(full_index));
+        absl::c_copy(subindex, std::back_inserter(full_index));
+        if (!alias_config.OutputHasAlias(
+                ShapeIndex(full_index.begin(), full_index.end()))) {
+          all_aliased = false;
+        }
+      });
+  return all_aliased;
+}
+
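// Standalone sketch of the helper above: an output tuple element counts as
// fully aliased only if every leaf under it has an alias entry. Shape indices
// are modeled as plain integer vectors and the alias configuration as a set;
// the names and the example indices are assumptions for illustration only.
#include <cstdint>
#include <set>
#include <vector>

namespace alias_sketch {

using Index = std::vector<std::int64_t>;

bool AllLeavesAliased(const Index& index,
                      const std::vector<Index>& leaf_subindices,
                      const std::set<Index>& aliased_output_indices) {
  for (const Index& sub : leaf_subindices) {
    // Concatenate the tuple's index with the leaf's sub-index, as the lambda
    // above does when it builds full_index with absl::c_copy.
    Index full = index;
    full.insert(full.end(), sub.begin(), sub.end());
    if (aliased_output_indices.count(full) == 0) {
      return false;  // one leaf without an alias: the tuple is not aliased
    }
  }
  return true;
}

}  // namespace alias_sketch

// Example: for output index {1} with leaves {0} and {1}, both {1, 0} and
// {1, 1} must be present in aliased_output_indices for this to return true.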
 StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     std::vector<ExecutionInput> arguments,
@@ -425,84 +455,102 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
   }
 
   se::StreamExecutor* executor = run_options->stream()->parent();
-  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
-                      GenerateBufferAllocations(arguments, globals,
-                                                memory_allocator, executor));
-
-  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
-    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
-  }
-  VLOG(2) << buffer_allocations.ToString();
-  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
-                                   block_host_until_done,
-                                   hlo_execution_profile));
 
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  ExecutionOutput result(root->shape(), root->shape(), memory_allocator,
+  ExecutionOutput result(/*on_host_shape=*/root->shape(),
+                         /*on_device_shape=*/root->shape(), memory_allocator,
                          device_ordinal);
+  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
+                      GenerateBufferAllocations(arguments, globals,
+                                                memory_allocator, executor));
+  VLOG(2) << buffer_allocations.ToString();
   std::set<se::DeviceMemoryBase> buffers_in_result;
   for (auto& p : result.MutableResult()->buffers()) {
     const ShapeIndex& index = p.first;
-    se::DeviceMemoryBase& device_memory = p.second;
+    se::DeviceMemoryBase& result_buffer = p.second;
     const auto& sources = GetRootValueSet().element(index);
     // The points-to set is unambiguous so the set should be a
     // singleton. That is, we know exactly which instruction
     // produced the array at this element.
     CHECK_EQ(1, sources.values().size());
-    auto src_hlo = sources.values()[0]->instruction();
+    HloInstruction* src_hlo = sources.values()[0]->instruction();
     VLOG(4) << "Looking at: " << sources.values()[0];
-    // The source instruction should have a non-parameter buffer
-    // assigned.
-    TF_ASSIGN_OR_RETURN(
-        const BufferAllocation::Slice slice,
-        assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
-
-    se::DeviceMemoryBase src_base =
-        buffer_allocations.GetDeviceAddress(slice.index());
-    CHECK(!src_base.is_null() || src_base.size() == 0);
-    if (!slice.allocation()->is_entry_computation_parameter()) {
-      // If the buffer coming out of the result is from a parameter, it
-      // means the caller aliased some parameter buffer to an output one
-      // (via the HloInputOutputAliasConfig API). If that is the case, the
-      // caller will receive a partially complete scoped shaped buffer,
-      // which they will have to fill up on return.
-      // Unfortunately the interface to the execute APIs are ShapedBuffer
-      // pointer based, which assumes caller ownership, and hence a buffer
-      // coming from there cannot be part of the new ScopedShapedBuffer we
-      // create for the result (which assumes ownership).
-      device_memory = src_base;
-    } else {
-      const HloInputOutputAliasConfig& input_output_alias =
-          module().input_output_alias_config();
-      auto output_alias = input_output_alias.GetAliasedOutput(
-          slice.allocation()->parameter_number(),
-          slice.allocation()->param_shape_index());
-      CHECK(output_alias) << "Output buffer is coming from parameter "
-                          << slice.allocation()->parameter_number()
-                          << " at index "
-                          << slice.allocation()->param_shape_index()
-                          << ", but no alias exists";
-      CHECK_EQ(*output_alias, index);
+    const HloInputOutputAliasConfig& input_output_alias =
+        module().input_output_alias_config();
+    absl::optional<HloInputOutputAliasConfig::Alias> alias =
+        input_output_alias.GetAliasedParameter(index);
+    if (alias) {
+      CHECK_LT(alias->parameter_number, arguments.size());
+      ExecutionInput& input = arguments[alias->parameter_number];
+      MaybeOwningDeviceMemory* maybe_owning_memory =
+          input.MutableBuffer(alias->parameter_index);
+      if (absl::optional<se::OwningDeviceMemory> owning =
+              maybe_owning_memory->Release()) {
+        // If the caller passes the ownership of the device memory, reuse it
+        // as the output buffer. It is up to the caller whether or not to
+        // donate a buffer; the aliasing information describes which buffers
+        // may alias, not buffers that must alias.
+        se::DeviceMemoryBase argument_buffer = owning->Release();
+        *maybe_owning_memory = argument_buffer;
+        result_buffer = argument_buffer;
+        if (alias->kind == HloInputOutputAliasConfig::kUserAlias) {
+          // This is a user alias, so a must alias. The caller is giving us the
+          // input buffer, but in case of error from the execute call, we should
+          // not be releasing it as it contains valid data (for example, it is a
+          // parameter which the user wants us to alias, in a gradient update
+          // computation). So we store the index into the result in the aliased
+          // vector, which will be fed to the ExecutionOutput, which will use
+          // the indices to drop the addresses from its own ScopedShapedBuffer
+          // result, if the ExecutionOutput is not committed.
+          result.AddAliasedIndex(index);
+        }
+      }
     }
-    buffers_in_result.insert(src_base);
+
+    if (result_buffer.is_null()) {
+      // The source instruction should have a non-parameter buffer
+      // assigned.
+      TF_ASSIGN_OR_RETURN(
+          const BufferAllocation::Slice slice,
+          assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
+      result_buffer = buffer_allocations.GetDeviceAddress(slice.index());
+
+      // If the entire tuple contents is aliased, the copy insertion will *not*
+      // materialize a new tuple, so we mark it as aliased as well.
+      if (EntireTupleContentsAliased(root->shape(), index,
+                                     input_output_alias)) {
+        result.AddAliasedIndex(index);
+      }
+    }
+    buffers_in_result.insert(result_buffer);
   }
+
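// Standalone sketch of the donation decision in the loop above, with the XLA
// types replaced by standard ones: an aliased input's buffer may or may not
// carry ownership; only if ownership is released to the executable is the
// donated buffer reused as the output, otherwise the output buffer stays
// unset and is later filled from XLA's own allocation. The names Buffer,
// MaybeOwning, and PickOutputBuffer are illustrative, not XLA APIs.
#include <optional>

namespace donation_sketch {

struct Buffer { void* ptr = nullptr; };

struct MaybeOwning {
  Buffer buffer;
  bool owned = false;
  // Returns the buffer only if the caller donated (owned) it.
  std::optional<Buffer> Release() {
    if (!owned) return std::nullopt;
    owned = false;
    return buffer;
  }
};

Buffer PickOutputBuffer(MaybeOwning& aliased_input, Buffer xla_allocated) {
  if (std::optional<Buffer> donated = aliased_input.Release()) {
    return *donated;     // reuse the donated input buffer as the output
  }
  return xla_allocated;  // caller kept ownership: use XLA's own allocation
}

}  // namespace donation_sketch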
+  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
+    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
+  }
+  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
+                                   block_host_until_done,
+                                   hlo_execution_profile));
+
+  // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
       buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
 
-  std::vector<se::OwningDeviceMemory> buffers_to_free;
-  for (auto& argument : arguments) {
+  // Free allocations for arguments.
+  for (ExecutionInput& argument : arguments) {
     for (auto& index_buffer : *argument.MutableBuffers()) {
-      auto maybe_owning_buffer = index_buffer.second.Release();
-      if (maybe_owning_buffer) {
-        buffers_to_free.push_back(std::move(*maybe_owning_buffer));
+      if (absl::optional<se::OwningDeviceMemory> owning =
+              index_buffer.second.Release()) {
+        result.AddToBeReleased(std::move(*owning));
       }
     }
   }
-  return result;
+
+  return std::move(result);
 }
 
 const InstructionValueSet& GpuExecutable::GetRootValueSet() const {
diff --git a/tensorflow/compiler/xla/tests/buffer_donation_test.cc b/tensorflow/compiler/xla/tests/buffer_donation_test.cc
index be76fa74ae2..af2c0b1b3ce 100644
--- a/tensorflow/compiler/xla/tests/buffer_donation_test.cc
+++ b/tensorflow/compiler/xla/tests/buffer_donation_test.cc
@@ -216,8 +216,8 @@ TEST_F(BufferDonationTest, SimpleWhileTupleTest) {
       HloInstruction::CreateGetTupleElement(f32v1_, while0, 1));
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
-  // Input output aliasing is only supported on TPU.
-#if defined(XLA_TEST_BACKEND_TPU)
+  // Input output aliasing is supported on CPU and GPU.
+#if defined(XLA_TEST_BACKEND_TPU) || defined(XLA_TEST_BACKEND_GPU)
   TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({0}, 0, {0}));
   TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({1}, 0, {1}));
 #endif
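// Standalone sketch of what the test above sets up: each alias entry pairs an
// output shape index with a (parameter number, parameter shape index) source.
// The struct below only mirrors that bookkeeping with standard types; the
// real API is HloInputOutputAliasConfig::SetUpAlias as called in the test.
#include <cstdint>
#include <vector>

namespace alias_config_sketch {

struct AliasEntry {
  std::vector<std::int64_t> output_index;     // e.g. {0}: first tuple element
  std::int64_t parameter_number;              // e.g. 0: first entry parameter
  std::vector<std::int64_t> parameter_index;  // e.g. {0}: its first element
};

// The two SetUpAlias calls in the test correspond to these entries: output
// element {0} aliases parameter 0 at {0}, and output {1} aliases it at {1}.
inline std::vector<AliasEntry> WhileTupleAliases() {
  return {{{0}, 0, {0}}, {{1}, 0, {1}}};
}

}  // namespace alias_config_sketch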