From bbe0c4590c90a83dde1c24b91284ef370ba06eb0 Mon Sep 17 00:00:00 2001
From: George Karpenkov
Date: Tue, 9 Jun 2020 19:10:24 -0700
Subject: [PATCH] [XLA:GPU] Support buffer aliasing for XLA:GPU

This is the change on the XLA side; the change on the TF/XLA bridge is still
TBD.

Dropping the check in xla_launch_util, as aliased buffers are no longer
represented as nulls.

PiperOrigin-RevId: 315607922
Change-Id: I24903d9288604cff142a7f0872d924a5da621e49
---
 tensorflow/compiler/jit/xla_launch_util.cc    |   4 -
 .../xla/service/gpu/gpu_executable.cc         | 188 +++++++++++-------
 .../xla/tests/buffer_donation_test.cc         |   4 +-
 3 files changed, 120 insertions(+), 76 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc
index 8c24f182f5c..209220938ed 100644
--- a/tensorflow/compiler/jit/xla_launch_util.cc
+++ b/tensorflow/compiler/jit/xla_launch_util.cc
@@ -468,10 +468,6 @@ Status XlaComputationLaunchContext::PopulateOutputs(
           << "Invalid input for outputs " << i << ": " << input_index;
       ctx->set_output(i, ctx->input(input_index));
     } else {
-      if (MustAliasOutput(input_output_alias, output_num)) {
-        DCHECK(output.buffer({output_num}).is_null())
-            << "Expected output buffer to be aliased, but it is not nil.";
-      }
       if (allocate_xla_tensors_) {
         TF_RETURN_IF_ERROR(SetBufferForTensorUnderAllocateXlaTensors(
             input_output_alias, output_num, ctx, i, shape, &output,
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index bf65df20544..c8b11cab31a 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -360,6 +360,27 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
   }
 }
 
+static Status CheckAlignment(const BufferAllocation& allocation,
+                             se::DeviceMemoryBase buffer, int arg_idx) {
+  const int64 expected_alignment = [&] {
+    if (allocation.is_entry_computation_parameter()) {
+      return kEntryParameterAlignBytes;
+    } else if (allocation.is_constant()) {
+      return kConstantBufferAlignBytes;
+    } else {
+      return kXlaAllocatedBufferAlignBytes;
+    }
+  }();
+  if (!buffer.is_null() &&
+      reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != 0) {
+    return InternalError(
+        "Address of buffer %d must be a multiple of %x, but "
+        "was %p",
+        arg_idx, expected_alignment, buffer.opaque());
+  }
+  return Status::OK();
+}
+
 StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
     absl::Span<ExecutionInput const> arguments,
     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
@@ -378,28 +399,37 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
         se::DeviceMemoryBase buffer,
         BufferForAllocation(arguments, globals, allocation, memory_allocator,
                             executor->device_ordinal(), i));
-    const int64 expected_alignment = [&] {
-      if (allocation.is_entry_computation_parameter()) {
-        return kEntryParameterAlignBytes;
-      } else if (allocation.is_constant()) {
-        return kConstantBufferAlignBytes;
-      } else {
-        return kXlaAllocatedBufferAlignBytes;
-      }
-    }();
-    if (!buffer.is_null() &&
-        reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
-            0) {
-      return InternalError(
-          "Address of buffer %d must be a multiple of %x, but "
-          "was %p",
-          i, expected_alignment, buffer.opaque());
-    }
     buffers.push_back(buffer);
+    TF_RETURN_IF_ERROR(CheckAlignment(allocation, buffer, i));
  }
   return {{buffers, executor->device_ordinal(), memory_allocator}};
 }
 
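// A minimal, standalone sketch of the alignment check factored out above,
// using only the standard library. The alignment values and names here are
// illustrative assumptions, not the XLA constants; the snippet only mirrors
// the "address must be a multiple of the expected alignment" rule.
#include <cstdint>
#include <cstdio>

namespace alignment_sketch {

constexpr std::uintptr_t kExampleParameterAlignBytes = 16;
constexpr std::uintptr_t kExampleDefaultAlignBytes = 64;

// Pick the expected alignment from the kind of buffer, then verify that the
// address is a multiple of it, as CheckAlignment does for real device buffers.
bool IsSuitablyAligned(const void* ptr, bool is_entry_parameter) {
  const std::uintptr_t expected =
      is_entry_parameter ? kExampleParameterAlignBytes : kExampleDefaultAlignBytes;
  return reinterpret_cast<std::uintptr_t>(ptr) % expected == 0;
}

}  // namespace alignment_sketch

int main() {
  alignas(64) static char storage[128];
  std::printf("aligned: %d\n",
              alignment_sketch::IsSuitablyAligned(storage,
                                                  /*is_entry_parameter=*/false));
}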
+// Returns `true` if the entire tuple contents is aliased.
+static bool EntireTupleContentsAliased(
+    const Shape& output_shape, const ShapeIndex& index,
+    const HloInputOutputAliasConfig& alias_config) {
+  const Shape& indexed_shape = ShapeUtil::GetSubshape(output_shape, index);
+  if (!indexed_shape.IsTuple()) {
+    return false;
+  }
+  bool all_aliased = true;
+  ShapeUtil::ForEachSubshape(
+      indexed_shape, [&](const Shape& subshape, const ShapeIndex& subindex) {
+        if (subindex.empty()) {
+          return;
+        }
+        std::vector<int64> full_index;
+        absl::c_copy(index, std::back_inserter(full_index));
+        absl::c_copy(subindex, std::back_inserter(full_index));
+        if (!alias_config.OutputHasAlias(
+                ShapeIndex(full_index.begin(), full_index.end()))) {
+          all_aliased = false;
+        }
+      });
+  return all_aliased;
+}
+
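// Standalone sketch of the helper above: an output tuple element counts as
// fully aliased only if every leaf under it has an alias entry. Shape indices
// are modeled as plain integer vectors and the alias configuration as a set;
// the names and the example indices are assumptions for illustration only.
#include <cstdint>
#include <set>
#include <vector>

namespace alias_sketch {

using Index = std::vector<std::int64_t>;

bool AllLeavesAliased(const Index& index,
                      const std::vector<Index>& leaf_subindices,
                      const std::set<Index>& aliased_output_indices) {
  for (const Index& sub : leaf_subindices) {
    // Concatenate the tuple's index with the leaf's sub-index, as the lambda
    // above does when it builds full_index with absl::c_copy.
    Index full = index;
    full.insert(full.end(), sub.begin(), sub.end());
    if (aliased_output_indices.count(full) == 0) {
      return false;  // one leaf without an alias: the tuple is not aliased
    }
  }
  return true;
}

}  // namespace alias_sketch

// Example: for output index {1} with leaves {0} and {1}, both {1, 0} and
// {1, 1} must be present in aliased_output_indices for this to return true.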
 StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     std::vector<ExecutionInput> arguments,
@@ -425,84 +455,102 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
   }
 
   se::StreamExecutor* executor = run_options->stream()->parent();
-  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
-                      GenerateBufferAllocations(arguments, globals,
-                                                memory_allocator, executor));
-
-  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
-    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
-  }
-  VLOG(2) << buffer_allocations.ToString();
-  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
-                                   block_host_until_done,
-                                   hlo_execution_profile));
 
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  ExecutionOutput result(root->shape(), root->shape(), memory_allocator,
+  ExecutionOutput result(/*on_host_shape=*/root->shape(),
+                         /*on_device_shape=*/root->shape(), memory_allocator,
                          device_ordinal);
+  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
+                      GenerateBufferAllocations(arguments, globals,
+                                                memory_allocator, executor));
+  VLOG(2) << buffer_allocations.ToString();
   std::set<se::DeviceMemoryBase> buffers_in_result;
   for (auto& p : result.MutableResult()->buffers()) {
     const ShapeIndex& index = p.first;
-    se::DeviceMemoryBase& device_memory = p.second;
+    se::DeviceMemoryBase& result_buffer = p.second;
     const auto& sources = GetRootValueSet().element(index);
     // The points-to set is unambiguous so the set should be a
     // singleton. That is, we know exactly which instruction
     // produced the array at this element.
     CHECK_EQ(1, sources.values().size());
-    auto src_hlo = sources.values()[0]->instruction();
+    HloInstruction* src_hlo = sources.values()[0]->instruction();
     VLOG(4) << "Looking at: " << sources.values()[0];
-    // The source instruction should have a non-parameter buffer
-    // assigned.
-    TF_ASSIGN_OR_RETURN(
-        const BufferAllocation::Slice slice,
-        assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
-
-    se::DeviceMemoryBase src_base =
-        buffer_allocations.GetDeviceAddress(slice.index());
-    CHECK(!src_base.is_null() || src_base.size() == 0);
-    if (!slice.allocation()->is_entry_computation_parameter()) {
-      // If the buffer coming out of the result is from a parameter, it
-      // means the caller aliased some parameter buffer to an output one
-      // (via the HloInputOutputAliasConfig API). If that is the case, the
-      // caller will receive a partially complete scoped shaped buffer,
-      // which they will have to fill up on return.
-      // Unfortunately the interface to the execute APIs are ShapedBuffer
-      // pointer based, which assumes caller ownership, and hence a buffer
-      // coming from there cannot be part of the new ScopedShapedBuffer we
-      // create for the result (which assumes ownership).
-      device_memory = src_base;
-    } else {
-      const HloInputOutputAliasConfig& input_output_alias =
-          module().input_output_alias_config();
-      auto output_alias = input_output_alias.GetAliasedOutput(
-          slice.allocation()->parameter_number(),
-          slice.allocation()->param_shape_index());
-      CHECK(output_alias) << "Output buffer is coming from parameter "
-                          << slice.allocation()->parameter_number()
-                          << " at index "
-                          << slice.allocation()->param_shape_index()
-                          << ", but no alias exists";
-      CHECK_EQ(*output_alias, index);
+    const HloInputOutputAliasConfig& input_output_alias =
+        module().input_output_alias_config();
+    absl::optional<HloInputOutputAliasConfig::Alias> alias =
+        input_output_alias.GetAliasedParameter(index);
+    if (alias) {
+      CHECK_LT(alias->parameter_number, arguments.size());
+      ExecutionInput& input = arguments[alias->parameter_number];
+      MaybeOwningDeviceMemory* maybe_owning_memory =
+          input.MutableBuffer(alias->parameter_index);
+      if (absl::optional<se::OwningDeviceMemory> owning =
+              maybe_owning_memory->Release()) {
+        // If the caller passes the ownership of the device memory, reuse it
+        // as the output buffer. It is up to the caller whether or not to
+        // donate a buffer; the aliasing information describes which buffers
+        // may alias, not buffers that must alias.
+        se::DeviceMemoryBase argument_buffer = owning->Release();
+        *maybe_owning_memory = argument_buffer;
+        result_buffer = argument_buffer;
+        if (alias->kind == HloInputOutputAliasConfig::kUserAlias) {
+          // This is a user alias, so a must alias. The caller is giving us the
+          // input buffer, but in case of error from the execute call, we should
+          // not be releasing it as it contains valid data (for example, it is a
+          // parameter which the user wants us to alias, in a gradient update
+          // computation). So we store the index into the result in the aliased
+          // vector, which will be fed to the ExecutionOutput, which will use
+          // the indices to drop the addresses from its own ScopedShapedBuffer
+          // result, if the ExecutionOutput is not committed.
+          result.AddAliasedIndex(index);
+        }
+      }
     }
-    buffers_in_result.insert(src_base);
+
+    if (result_buffer.is_null()) {
+      // The source instruction should have a non-parameter buffer
+      // assigned.
+      TF_ASSIGN_OR_RETURN(
+          const BufferAllocation::Slice slice,
+          assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
+      result_buffer = buffer_allocations.GetDeviceAddress(slice.index());
+
+      // If the entire tuple contents is aliased, the copy insertion will *not*
+      // materialize a new tuple, so we mark it as aliased as well.
+      if (EntireTupleContentsAliased(root->shape(), index,
+                                     input_output_alias)) {
+        result.AddAliasedIndex(index);
+      }
+    }
+    buffers_in_result.insert(result_buffer);
   }
+
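// Standalone sketch of the donation decision in the loop above, with the XLA
// types replaced by standard ones: an aliased input's buffer may or may not
// carry ownership; only if ownership is released to the executable is the
// donated buffer reused as the output, otherwise the output buffer stays
// unset and is later filled from XLA's own allocation. The names Buffer,
// MaybeOwning, and PickOutputBuffer are illustrative, not XLA APIs.
#include <optional>

namespace donation_sketch {

struct Buffer { void* ptr = nullptr; };

struct MaybeOwning {
  Buffer buffer;
  bool owned = false;
  // Returns the buffer only if the caller donated (owned) it.
  std::optional<Buffer> Release() {
    if (!owned) return std::nullopt;
    owned = false;
    return buffer;
  }
};

Buffer PickOutputBuffer(MaybeOwning& aliased_input, Buffer xla_allocated) {
  if (std::optional<Buffer> donated = aliased_input.Release()) {
    return *donated;     // reuse the donated input buffer as the output
  }
  return xla_allocated;  // caller kept ownership: use XLA's own allocation
}

}  // namespace donation_sketch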
+  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
+    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
+  }
+  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
+                                   block_host_until_done,
+                                   hlo_execution_profile));
+
+  // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
       buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
 
-  std::vector<se::OwningDeviceMemory> buffers_to_free;
-  for (auto& argument : arguments) {
+  // Free allocations for arguments.
+  for (ExecutionInput& argument : arguments) {
     for (auto& index_buffer : *argument.MutableBuffers()) {
-      auto maybe_owning_buffer = index_buffer.second.Release();
-      if (maybe_owning_buffer) {
-        buffers_to_free.push_back(std::move(*maybe_owning_buffer));
+      if (absl::optional<se::OwningDeviceMemory> owning =
+              index_buffer.second.Release()) {
+        result.AddToBeReleased(std::move(*owning));
       }
     }
   }
-  return result;
+
+  return std::move(result);
 }
 
 const InstructionValueSet& GpuExecutable::GetRootValueSet() const {
diff --git a/tensorflow/compiler/xla/tests/buffer_donation_test.cc b/tensorflow/compiler/xla/tests/buffer_donation_test.cc
index be76fa74ae2..af2c0b1b3ce 100644
--- a/tensorflow/compiler/xla/tests/buffer_donation_test.cc
+++ b/tensorflow/compiler/xla/tests/buffer_donation_test.cc
@@ -216,8 +216,8 @@ TEST_F(BufferDonationTest, SimpleWhileTupleTest) {
       HloInstruction::CreateGetTupleElement(f32v1_, while0, 1));
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
-  // Input output aliasing is only supported on TPU.
-#if defined(XLA_TEST_BACKEND_TPU)
+  // Input output aliasing is supported on CPU and GPU.
+#if defined(XLA_TEST_BACKEND_TPU) || defined(XLA_TEST_BACKEND_GPU)
   TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({0}, 0, {0}));
   TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({1}, 0, {1}));
 #endif
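// Standalone sketch of what the test above sets up: each alias entry pairs an
// output shape index with a (parameter number, parameter shape index) source.
// The struct below only mirrors that bookkeeping with standard types; the
// real API is HloInputOutputAliasConfig::SetUpAlias as called in the test.
#include <cstdint>
#include <vector>

namespace alias_config_sketch {

struct AliasEntry {
  std::vector<std::int64_t> output_index;     // e.g. {0}: first tuple element
  std::int64_t parameter_number;              // e.g. 0: first entry parameter
  std::vector<std::int64_t> parameter_index;  // e.g. {0}: its first element
};

// The two SetUpAlias calls in the test correspond to these entries: output
// element {0} aliases parameter 0 at {0}, and output {1} aliases it at {1}.
inline std::vector<AliasEntry> WhileTupleAliases() {
  return {{{0}, 0, {0}}, {{1}, 0, {1}}};
}

}  // namespace alias_config_sketch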