[XLA:GPU] Support buffer aliasing for XLA:GPU

This is the change on the XLA side; the corresponding change on the TF/XLA bridge is still TBD.

Dropping the check in xla_launch_util, as aliased buffers are no longer
represented as nulls.

PiperOrigin-RevId: 315607922
Change-Id: I24903d9288604cff142a7f0872d924a5da621e49
Author: George Karpenkov, 2020-06-09 19:10:24 -07:00 (committed by TensorFlower Gardener)
Commit: bbe0c4590c (parent: 4738f8a6e6)
3 changed files with 120 additions and 76 deletions
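For context, the sketch below shows how a caller opts into input-output aliasing at the HLO level, mirroring the SetUpAlias calls added to buffer_donation_test further down. It is only an illustration, not part of this change: module is assumed to be an already-built HloModule whose entry computation takes one tuple parameter and returns a tuple of the same shape, and the indices are made up for the example; the argument order shown is (output index, parameter number, parameter index).

// Alias output tuple element {0} to element {0} of parameter 0, and output
// element {1} to element {1} of parameter 0. With this change, GpuExecutable
// can reuse donated argument buffers as the corresponding result buffers
// instead of allocating fresh ones.
xla::HloInputOutputAliasConfig& alias_config =
    module->input_output_alias_config();
TF_ASSERT_OK(alias_config.SetUpAlias(/*output_index=*/{0},
                                     /*param_number=*/0,
                                     /*param_index=*/{0}));
TF_ASSERT_OK(alias_config.SetUpAlias(/*output_index=*/{1},
                                     /*param_number=*/0,
                                     /*param_index=*/{1}));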


@@ -468,10 +468,6 @@ Status XlaComputationLaunchContext::PopulateOutputs(
<< "Invalid input for outputs " << i << ": " << input_index;
ctx->set_output(i, ctx->input(input_index));
} else {
if (MustAliasOutput(input_output_alias, output_num)) {
DCHECK(output.buffer({output_num}).is_null())
<< "Expected output buffer to be aliased, but it is not nil.";
}
if (allocate_xla_tensors_) {
TF_RETURN_IF_ERROR(SetBufferForTensorUnderAllocateXlaTensors(
input_output_alias, output_num, ctx, i, shape, &output,


@@ -360,6 +360,27 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
}
}
static Status CheckAlignment(const BufferAllocation& allocation,
se::DeviceMemoryBase buffer, int arg_idx) {
const int64 expected_alignment = [&] {
if (allocation.is_entry_computation_parameter()) {
return kEntryParameterAlignBytes;
} else if (allocation.is_constant()) {
return kConstantBufferAlignBytes;
} else {
return kXlaAllocatedBufferAlignBytes;
}
}();
if (!buffer.is_null() &&
reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != 0) {
return InternalError(
"Address of buffer %d must be a multiple of %x, but "
"was %p",
arg_idx, expected_alignment, buffer.opaque());
}
return Status::OK();
}
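A quick worked illustration of the test CheckAlignment performs: an address passes only if it is an exact multiple of the expected alignment. The addresses and alignment values below are made-up examples; the real constants (kEntryParameterAlignBytes and friends) live elsewhere in the GPU backend and are not repeated here.

#include <cassert>
#include <cstdint>

// Toy version of the check inside CheckAlignment.
bool IsAligned(std::uintptr_t address, std::uint64_t expected_alignment) {
  return address % expected_alignment == 0;
}

int main() {
  assert(IsAligned(0x7f0000000100, 256));    // multiple of 256: OK
  assert(!IsAligned(0x7f0000000108, 256));   // off by 8 bytes: would fail
  assert(IsAligned(0x7f0000000108, 8));      // fine for 8-byte alignment
  return 0;
}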
StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
absl::Span<ExecutionInput const> arguments,
const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
@@ -378,28 +399,37 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
se::DeviceMemoryBase buffer,
BufferForAllocation(arguments, globals, allocation, memory_allocator,
executor->device_ordinal(), i));
const int64 expected_alignment = [&] {
if (allocation.is_entry_computation_parameter()) {
return kEntryParameterAlignBytes;
} else if (allocation.is_constant()) {
return kConstantBufferAlignBytes;
} else {
return kXlaAllocatedBufferAlignBytes;
}
}();
if (!buffer.is_null() &&
reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
0) {
return InternalError(
"Address of buffer %d must be a multiple of %x, but "
"was %p",
i, expected_alignment, buffer.opaque());
}
buffers.push_back(buffer);
TF_RETURN_IF_ERROR(CheckAlignment(allocation, buffer, i));
}
return {{buffers, executor->device_ordinal(), memory_allocator}};
}
// Returns `true` if the entire tuple contents is aliased.
static bool EntireTupleContentsAliased(
const Shape& output_shape, const ShapeIndex& index,
const HloInputOutputAliasConfig& alias_config) {
const Shape& indexed_shape = ShapeUtil::GetSubshape(output_shape, index);
if (!indexed_shape.IsTuple()) {
return false;
}
bool all_aliased = true;
ShapeUtil::ForEachSubshape(
indexed_shape, [&](const Shape& subshape, const ShapeIndex& subindex) {
if (subindex.empty()) {
return;
}
std::vector<int64> full_index;
absl::c_copy(index, std::back_inserter(full_index));
absl::c_copy(subindex, std::back_inserter(full_index));
if (!alias_config.OutputHasAlias(
ShapeIndex(full_index.begin(), full_index.end()))) {
all_aliased = false;
}
});
return all_aliased;
}
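Concretely: if the output shape is a 2-tuple and both leaves {0} and {1} are aliased to parameters, the enclosing index {} counts as aliased too, since copy insertion will not materialize a fresh tuple in that case. The following toy sketch illustrates that rule with plain standard-library types standing in for the XLA Shape and alias-config classes; it is only an illustration of the predicate, not the real implementation.

#include <cassert>
#include <set>
#include <vector>

// Leaves of the output tuple are identified by their shape indices; a tuple
// index counts as aliased only when every leaf underneath it is aliased.
using Index = std::vector<int>;

bool AllLeavesAliased(const std::vector<Index>& leaves,
                      const std::set<Index>& aliased_leaves) {
  for (const Index& leaf : leaves) {
    if (aliased_leaves.count(leaf) == 0) {
      return false;
    }
  }
  return true;
}

int main() {
  const std::vector<Index> leaves = {{0}, {1}};
  assert(AllLeavesAliased(leaves, {{0}, {1}}));  // whole tuple is aliased
  assert(!AllLeavesAliased(leaves, {{0}}));      // leaf {1} is not aliased
  return 0;
}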
StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
const ServiceExecutableRunOptions* run_options,
std::vector<ExecutionInput> arguments,
@@ -425,84 +455,102 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
}
se::StreamExecutor* executor = run_options->stream()->parent();
TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
GenerateBufferAllocations(arguments, globals,
memory_allocator, executor));
for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
}
VLOG(2) << buffer_allocations.ToString();
TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
block_host_until_done,
hlo_execution_profile));
HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
auto device_ordinal = executor->device_ordinal();
ExecutionOutput result(root->shape(), root->shape(), memory_allocator,
ExecutionOutput result(/*on_host_shape=*/root->shape(),
/*on_device_shape=*/root->shape(), memory_allocator,
device_ordinal);
TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
GenerateBufferAllocations(arguments, globals,
memory_allocator, executor));
VLOG(2) << buffer_allocations.ToString();
std::set<se::DeviceMemoryBase> buffers_in_result;
for (auto& p : result.MutableResult()->buffers()) {
const ShapeIndex& index = p.first;
se::DeviceMemoryBase& device_memory = p.second;
se::DeviceMemoryBase& result_buffer = p.second;
const auto& sources = GetRootValueSet().element(index);
// The points-to set is unambiguous so the set should be a
// singleton. That is, we know exactly which instruction
// produced the array at this element.
CHECK_EQ(1, sources.values().size());
auto src_hlo = sources.values()[0]->instruction();
HloInstruction* src_hlo = sources.values()[0]->instruction();
VLOG(4) << "Looking at: " << sources.values()[0];
// The source instruction should have a non-parameter buffer
// assigned.
TF_ASSIGN_OR_RETURN(
const BufferAllocation::Slice slice,
assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
se::DeviceMemoryBase src_base =
buffer_allocations.GetDeviceAddress(slice.index());
CHECK(!src_base.is_null() || src_base.size() == 0);
if (!slice.allocation()->is_entry_computation_parameter()) {
// If the buffer coming out of the result is from a parameter, it
// means the caller aliased some parameter buffer to an output one
// (via the HloInputOutputAliasConfig API). If that is the case, the
// caller will receive a partially complete scoped shaped buffer,
// which they will have to fill up on return.
// Unfortunately the interface to the execute APIs are ShapedBuffer
// pointer based, which assumes caller ownership, and hence a buffer
// coming from there cannot be part of the new ScopedShapedBuffer we
// create for the result (which assumes ownership).
device_memory = src_base;
} else {
const HloInputOutputAliasConfig& input_output_alias =
module().input_output_alias_config();
auto output_alias = input_output_alias.GetAliasedOutput(
slice.allocation()->parameter_number(),
slice.allocation()->param_shape_index());
CHECK(output_alias) << "Output buffer is coming from parameter "
<< slice.allocation()->parameter_number()
<< " at index "
<< slice.allocation()->param_shape_index()
<< ", but no alias exists";
CHECK_EQ(*output_alias, index);
const HloInputOutputAliasConfig& input_output_alias =
module().input_output_alias_config();
absl::optional<HloInputOutputAliasConfig::Alias> alias =
input_output_alias.GetAliasedParameter(index);
if (alias) {
CHECK_LT(alias->parameter_number, arguments.size());
ExecutionInput& input = arguments[alias->parameter_number];
MaybeOwningDeviceMemory* maybe_owning_memory =
input.MutableBuffer(alias->parameter_index);
if (absl::optional<se::OwningDeviceMemory> owning =
maybe_owning_memory->Release()) {
// If the caller passes the ownership of the device memory, reuse it
// as the output buffer. It is up to the caller whether or not to
// donate a buffer; the aliasing information describes which buffers
// may alias, not buffers that must alias.
se::DeviceMemoryBase argument_buffer = owning->Release();
*maybe_owning_memory = argument_buffer;
result_buffer = argument_buffer;
if (alias->kind == HloInputOutputAliasConfig::kUserAlias) {
// This is a user alias, so a must alias. The caller is giving us the
// input buffer, but in case of error from the execute call, we should
// not be releasing it as it contains valid data (for example, it is a
// parameter which the user wants us to alias, in a gradient update
// computation). So we store the index into the result in the aliased
// vector, which will be fed to the ExecutionOutput, which will use
// the indices to drop the addresses from its own ScopedShapedBuffer
// result, if the ExecutionOutput is not committed.
result.AddAliasedIndex(index);
}
}
}
buffers_in_result.insert(src_base);
if (result_buffer.is_null()) {
// The source instruction should have a non-parameter buffer
// assigned.
TF_ASSIGN_OR_RETURN(
const BufferAllocation::Slice slice,
assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
result_buffer = buffer_allocations.GetDeviceAddress(slice.index());
// If the entire tuple contents is aliased, the copy insertion will *not*
// materialize a new tuple, so we mark it as aliased as well.
if (EntireTupleContentsAliased(root->shape(), index,
input_output_alias)) {
result.AddAliasedIndex(index);
}
}
buffers_in_result.insert(result_buffer);
}
for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
}
TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
block_host_until_done,
hlo_execution_profile));
// Free all temporary allocations.
TF_RETURN_IF_ERROR(
buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
std::vector<se::OwningDeviceMemory> buffers_to_free;
for (auto& argument : arguments) {
// Free allocations for arguments.
for (ExecutionInput& argument : arguments) {
for (auto& index_buffer : *argument.MutableBuffers()) {
auto maybe_owning_buffer = index_buffer.second.Release();
if (maybe_owning_buffer) {
buffers_to_free.push_back(std::move(*maybe_owning_buffer));
if (absl::optional<se::OwningDeviceMemory> owning =
index_buffer.second.Release()) {
result.AddToBeReleased(std::move(*owning));
}
}
}
return result;
return std::move(result);
}
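The aliasing branch above boils down to a small ownership hand-off: each argument slot may or may not own its device memory; if the caller donated it, the executable takes the address as the result buffer, otherwise the buffer stays with the caller and the XLA-allocated slice is used instead. The stand-alone sketch below mimics that pattern with toy types in place of MaybeOwningDeviceMemory and the result buffers; none of the real XLA classes are used.

#include <cassert>
#include <cstdint>
#include <optional>

using DeviceAddress = std::uintptr_t;

// Toy stand-in for MaybeOwningDeviceMemory: a slot that either owns its
// device memory (donated by the caller) or merely borrows it.
struct MaybeOwning {
  DeviceAddress address;
  bool owned;

  // Yields the address only if the slot owns it, and downgrades the slot to
  // non-owning, mirroring the role of MaybeOwningDeviceMemory::Release().
  std::optional<DeviceAddress> Release() {
    if (!owned) {
      return std::nullopt;
    }
    owned = false;
    return address;
  }
};

int main() {
  MaybeOwning donated{0x1000, /*owned=*/true};
  MaybeOwning borrowed{0x2000, /*owned=*/false};

  DeviceAddress result_buffer = 0;  // output buffer starts out "null"

  // Donated input: reuse its address as the output buffer.
  if (std::optional<DeviceAddress> owning = donated.Release()) {
    result_buffer = *owning;
  }
  assert(result_buffer == 0x1000);

  // Non-donated input: Release() yields nothing, so the executable falls back
  // to the buffer assigned by XLA for this output.
  assert(!borrowed.Release().has_value());
  return 0;
}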
const InstructionValueSet& GpuExecutable::GetRootValueSet() const {


@@ -216,8 +216,8 @@ TEST_F(BufferDonationTest, SimpleWhileTupleTest) {
HloInstruction::CreateGetTupleElement(f32v1_, while0, 1));
builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
module->AddEntryComputation(builder.Build());
// Input output aliasing is only supported on TPU.
#if defined(XLA_TEST_BACKEND_TPU)
// Input output aliasing is supported on CPU and GPU.
#if defined(XLA_TEST_BACKEND_TPU) || defined(XLA_TEST_BACKEND_GPU)
TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({0}, 0, {0}));
TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({1}, 0, {1}));
#endif