[XLA:GPU] Support buffer aliasing for XLA:GPU
This is the change on the XLA side; the change on the TF/XLA bridge is still TBD. The check in xla_launch_util is dropped, as aliased buffers are no longer represented as nulls.

PiperOrigin-RevId: 315607922
Change-Id: I24903d9288604cff142a7f0872d924a5da621e49
This commit is contained in:
parent 4738f8a6e6
commit bbe0c4590c
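Background for the diff below: an entry in the module's HloInputOutputAliasConfig means an output *may* reuse a parameter's buffer. The reuse actually happens only when the caller donates ownership of that buffer; otherwise the runtime falls back to a freshly assigned buffer (the result_buffer.is_null() path in gpu_executable.cc). This is also why the check in xla_launch_util goes away: an aliased output slot now arrives filled in with the donated address instead of null. A minimal, self-contained sketch of the donation contract, using stand-in types rather than the real se::DeviceMemoryBase and MaybeOwningDeviceMemory:

#include <cstdio>
#include <optional>

// Toy stand-ins for se::DeviceMemoryBase and MaybeOwningDeviceMemory,
// not the real XLA classes.
struct Mem {
  void* ptr = nullptr;
  bool is_null() const { return ptr == nullptr; }
};

struct MaybeOwning {
  Mem mem;
  bool owned = false;
  // Mirrors the shape of MaybeOwningDeviceMemory::Release(): yields the
  // memory only if the caller donated ownership.
  std::optional<Mem> Release() {
    if (!owned) return std::nullopt;
    owned = false;
    return mem;
  }
};

// Stand-in for an XLA-assigned output slice.
Mem AllocateFresh() {
  static char backing[64];
  return Mem{backing};
}

// If the aliased parameter was donated, reuse it as the output buffer;
// otherwise fall back to a fresh allocation, as in the
// result_buffer.is_null() branch of the gpu_executable.cc hunk below.
Mem ResolveOutput(MaybeOwning& aliased_param) {
  if (std::optional<Mem> owning = aliased_param.Release()) {
    return *owning;
  }
  return AllocateFresh();
}

int main() {
  char input[64];
  MaybeOwning donated{Mem{input}, /*owned=*/true};
  MaybeOwning borrowed{Mem{input}, /*owned=*/false};
  std::printf("donated reused:  %d\n", ResolveOutput(donated).ptr == input);   // 1
  std::printf("borrowed reused: %d\n", ResolveOutput(borrowed).ptr == input);  // 0
}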
tensorflow/compiler/jit/xla_launch_util.cc

@@ -468,10 +468,6 @@ Status XlaComputationLaunchContext::PopulateOutputs(
           << "Invalid input for outputs " << i << ": " << input_index;
       ctx->set_output(i, ctx->input(input_index));
     } else {
-      if (MustAliasOutput(input_output_alias, output_num)) {
-        DCHECK(output.buffer({output_num}).is_null())
-            << "Expected output buffer to be aliased, but it is not nil.";
-      }
       if (allocate_xla_tensors_) {
         TF_RETURN_IF_ERROR(SetBufferForTensorUnderAllocateXlaTensors(
             input_output_alias, output_num, ctx, i, shape, &output,
tensorflow/compiler/xla/service/gpu/gpu_executable.cc

@@ -360,6 +360,27 @@ StatusOr<se::DeviceMemoryBase> GpuExecutable::BufferForAllocation(
   }
 }
 
+static Status CheckAlignment(const BufferAllocation& allocation,
+                             se::DeviceMemoryBase buffer, int arg_idx) {
+  const int64 expected_alignment = [&] {
+    if (allocation.is_entry_computation_parameter()) {
+      return kEntryParameterAlignBytes;
+    } else if (allocation.is_constant()) {
+      return kConstantBufferAlignBytes;
+    } else {
+      return kXlaAllocatedBufferAlignBytes;
+    }
+  }();
+  if (!buffer.is_null() &&
+      reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment != 0) {
+    return InternalError(
+        "Address of buffer %d must be a multiple of %x, but "
+        "was %p",
+        arg_idx, expected_alignment, buffer.opaque());
+  }
+  return Status::OK();
+}
+
 StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
     absl::Span<ExecutionInput const> arguments,
     const GpuExecutable::BufferAllocToDeviceMemoryMap* globals,
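The new CheckAlignment helper centralizes a check that previously lived inline in GenerateBufferAllocations (see the next hunk): an immediately invoked lambda selects the expected alignment for the allocation kind, then a modulo test validates the address. A self-contained sketch of the same idiom, with made-up constants standing in for kEntryParameterAlignBytes and friends:

#include <cstdint>
#include <cstdio>

// Made-up alignment constants; the real kEntryParameterAlignBytes etc.
// live in gpu_executable.cc.
constexpr std::int64_t kParamAlign = 16;
constexpr std::int64_t kDefaultAlign = 64;

// Mirrors the structure of CheckAlignment: pick the expected alignment with
// an immediately invoked lambda, then test the address modulo it.
bool IsAligned(const void* ptr, bool is_parameter) {
  const std::int64_t expected = [&] {
    return is_parameter ? kParamAlign : kDefaultAlign;
  }();
  return reinterpret_cast<std::uintptr_t>(ptr) % expected == 0;
}

int main() {
  alignas(64) static char buf[128];
  std::printf("%d\n", IsAligned(buf, /*is_parameter=*/false));     // 1
  std::printf("%d\n", IsAligned(buf + 8, /*is_parameter=*/true));  // 0
}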
@@ -378,28 +399,37 @@ StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
         se::DeviceMemoryBase buffer,
         BufferForAllocation(arguments, globals, allocation, memory_allocator,
                             executor->device_ordinal(), i));
-    const int64 expected_alignment = [&] {
-      if (allocation.is_entry_computation_parameter()) {
-        return kEntryParameterAlignBytes;
-      } else if (allocation.is_constant()) {
-        return kConstantBufferAlignBytes;
-      } else {
-        return kXlaAllocatedBufferAlignBytes;
-      }
-    }();
-    if (!buffer.is_null() &&
-        reinterpret_cast<uintptr_t>(buffer.opaque()) % expected_alignment !=
-            0) {
-      return InternalError(
-          "Address of buffer %d must be a multiple of %x, but "
-          "was %p",
-          i, expected_alignment, buffer.opaque());
-    }
     buffers.push_back(buffer);
+    TF_RETURN_IF_ERROR(CheckAlignment(allocation, buffer, i));
   }
   return {{buffers, executor->device_ordinal(), memory_allocator}};
 }
 
+// Returns `true` if the entire tuple contents is aliased.
+static bool EntireTupleContentsAliased(
+    const Shape& output_shape, const ShapeIndex& index,
+    const HloInputOutputAliasConfig& alias_config) {
+  const Shape& indexed_shape = ShapeUtil::GetSubshape(output_shape, index);
+  if (!indexed_shape.IsTuple()) {
+    return false;
+  }
+  bool all_aliased = true;
+  ShapeUtil::ForEachSubshape(
+      indexed_shape, [&](const Shape& subshape, const ShapeIndex& subindex) {
+        if (subindex.empty()) {
+          return;
+        }
+        std::vector<int64> full_index;
+        absl::c_copy(index, std::back_inserter(full_index));
+        absl::c_copy(subindex, std::back_inserter(full_index));
+        if (!alias_config.OutputHasAlias(
+                ShapeIndex(full_index.begin(), full_index.end()))) {
+          all_aliased = false;
+        }
+      });
+  return all_aliased;
+}
+
 StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
     std::vector<ExecutionInput> arguments,
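A note on the index arithmetic in EntireTupleContentsAliased: the full path of a leaf inside the output is the outer index concatenated with the subshape index, and the tuple counts as fully aliased only if every such leaf path has an alias. A stand-alone sketch, with plain vectors standing in for xla::ShapeIndex and a set standing in for the alias config:

#include <cstdio>
#include <set>
#include <vector>

using Index = std::vector<int>;  // toy stand-in for xla::ShapeIndex

// Concatenates an outer tuple index with a subshape index, as done when
// forming the full output path passed to OutputHasAlias.
Index Concat(const Index& prefix, const Index& suffix) {
  Index full = prefix;
  full.insert(full.end(), suffix.begin(), suffix.end());
  return full;
}

int main() {
  // Suppose leaves {0,0} and {0,1} of the output are aliased.
  const std::set<Index> aliased = {{0, 0}, {0, 1}};
  const Index outer = {0};  // the tuple under inspection
  bool all_aliased = true;
  for (const Index& sub : {Index{0}, Index{1}}) {  // its leaf subindices
    if (aliased.count(Concat(outer, sub)) == 0) {
      all_aliased = false;
    }
  }
  std::printf("entire tuple {0} aliased: %d\n", all_aliased);  // prints 1
}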
@@ -425,84 +455,102 @@ StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
   }
 
   se::StreamExecutor* executor = run_options->stream()->parent();
-  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
-                      GenerateBufferAllocations(arguments, globals,
-                                                memory_allocator, executor));
-
-  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
-    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
-  }
-  VLOG(2) << buffer_allocations.ToString();
-  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
-                                   block_host_until_done,
-                                   hlo_execution_profile));
 
   HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
   auto device_ordinal = executor->device_ordinal();
-  ExecutionOutput result(root->shape(), root->shape(), memory_allocator,
+  ExecutionOutput result(/*on_host_shape=*/root->shape(),
+                         /*on_device_shape=*/root->shape(), memory_allocator,
                          device_ordinal);
 
+  TF_ASSIGN_OR_RETURN(BufferAllocations buffer_allocations,
+                      GenerateBufferAllocations(arguments, globals,
+                                                memory_allocator, executor));
+  VLOG(2) << buffer_allocations.ToString();
   std::set<se::DeviceMemoryBase> buffers_in_result;
   for (auto& p : result.MutableResult()->buffers()) {
     const ShapeIndex& index = p.first;
-    se::DeviceMemoryBase& device_memory = p.second;
+    se::DeviceMemoryBase& result_buffer = p.second;
     const auto& sources = GetRootValueSet().element(index);
     // The points-to set is unambiguous so the set should be a
     // singleton. That is, we know exactly which instruction
     // produced the array at this element.
     CHECK_EQ(1, sources.values().size());
-    auto src_hlo = sources.values()[0]->instruction();
+    HloInstruction* src_hlo = sources.values()[0]->instruction();
 
     VLOG(4) << "Looking at: " << sources.values()[0];
 
-    // The source instruction should have a non-parameter buffer
-    // assigned.
-    TF_ASSIGN_OR_RETURN(
-        const BufferAllocation::Slice slice,
-        assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
-
-    se::DeviceMemoryBase src_base =
-        buffer_allocations.GetDeviceAddress(slice.index());
-    CHECK(!src_base.is_null() || src_base.size() == 0);
-    if (!slice.allocation()->is_entry_computation_parameter()) {
-      // If the buffer coming out of the result is from a parameter, it
-      // means the caller aliased some parameter buffer to an output one
-      // (via the HloInputOutputAliasConfig API). If that is the case, the
-      // caller will receive a partially complete scoped shaped buffer,
-      // which they will have to fill up on return.
-      // Unfortunately the interface to the execute APIs are ShapedBuffer
-      // pointer based, which assumes caller ownership, and hence a buffer
-      // coming from there cannot be part of the new ScopedShapedBuffer we
-      // create for the result (which assumes ownership).
-      device_memory = src_base;
-    } else {
-      const HloInputOutputAliasConfig& input_output_alias =
-          module().input_output_alias_config();
-      auto output_alias = input_output_alias.GetAliasedOutput(
-          slice.allocation()->parameter_number(),
-          slice.allocation()->param_shape_index());
-      CHECK(output_alias) << "Output buffer is coming from parameter "
-                          << slice.allocation()->parameter_number()
-                          << " at index "
-                          << slice.allocation()->param_shape_index()
-                          << ", but no alias exists";
-      CHECK_EQ(*output_alias, index);
-    }
-    buffers_in_result.insert(src_base);
+    const HloInputOutputAliasConfig& input_output_alias =
+        module().input_output_alias_config();
+    absl::optional<HloInputOutputAliasConfig::Alias> alias =
+        input_output_alias.GetAliasedParameter(index);
+    if (alias) {
+      CHECK_LT(alias->parameter_number, arguments.size());
+      ExecutionInput& input = arguments[alias->parameter_number];
+      MaybeOwningDeviceMemory* maybe_owning_memory =
+          input.MutableBuffer(alias->parameter_index);
+      if (absl::optional<se::OwningDeviceMemory> owning =
+              maybe_owning_memory->Release()) {
+        // If the caller passes the ownership of the device memory, reuse it
+        // as the output buffer. It is up to the caller whether or not to
+        // donate a buffer; the aliasing information describes which buffers
+        // may alias, not buffers that must alias.
+        se::DeviceMemoryBase argument_buffer = owning->Release();
+        *maybe_owning_memory = argument_buffer;
+        result_buffer = argument_buffer;
+        if (alias->kind == HloInputOutputAliasConfig::kUserAlias) {
+          // This is a user alias, so a must alias. The caller is giving us the
+          // input buffer, but in case of error from the execute call, we should
+          // not be releasing it as it contains valid data (for example, it is a
+          // parameter which the user wants us to alias, in a gradient update
+          // computation). So we store the index into the result in the aliased
+          // vector, which will be fed to the ExecutionOutput, which will use
+          // the indices to drop the addresses from its own ScopedShapedBuffer
+          // result, if the ExecutionOutput is not committed.
+          result.AddAliasedIndex(index);
+        }
+      }
+    }
+
+    if (result_buffer.is_null()) {
+      // The source instruction should have a non-parameter buffer
+      // assigned.
+      TF_ASSIGN_OR_RETURN(
+          const BufferAllocation::Slice slice,
+          assignment_->GetUniqueSlice(src_hlo, sources.values()[0]->index()));
+      result_buffer = buffer_allocations.GetDeviceAddress(slice.index());
+
+      // If the entire tuple contents is aliased, the copy insertion will *not*
+      // materialize a new tuple, so we mark it as aliased as well.
+      if (EntireTupleContentsAliased(root->shape(), index,
+                                     input_output_alias)) {
+        result.AddAliasedIndex(index);
+      }
+    }
+    buffers_in_result.insert(result_buffer);
   }
 
+  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
+    TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor));
+  }
+  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, buffer_allocations,
+                                   block_host_until_done,
+                                   hlo_execution_profile));
+
   // Free all temporary allocations.
   TF_RETURN_IF_ERROR(
       buffer_allocations.TearDown(buffers_in_result, assignment_.get()));
 
-  std::vector<se::OwningDeviceMemory> buffers_to_free;
-  for (auto& argument : arguments) {
+  // Free allocations for arguments.
+  for (ExecutionInput& argument : arguments) {
     for (auto& index_buffer : *argument.MutableBuffers()) {
-      auto maybe_owning_buffer = index_buffer.second.Release();
-      if (maybe_owning_buffer) {
-        buffers_to_free.push_back(std::move(*maybe_owning_buffer));
+      if (absl::optional<se::OwningDeviceMemory> owning =
+              index_buffer.second.Release()) {
+        result.AddToBeReleased(std::move(*owning));
       }
     }
   }
 
-  return result;
+  return std::move(result);
 }
 
 const InstructionValueSet& GpuExecutable::GetRootValueSet() const {
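Why AddAliasedIndex matters: on an error return, ExecutionOutput frees the result buffers it owns, but a user-aliased buffer still belongs to the caller and holds valid data, so its index must be excluded from that cleanup unless the output is committed. A toy illustration of the pattern with stand-in types (the real ExecutionOutput tracks ShapeIndex entries, not raw addresses):

#include <cstdio>
#include <set>
#include <vector>

using Address = void*;  // toy stand-in for a device address

// Toy stand-in for ExecutionOutput: on destruction without Commit(), it
// "frees" the result buffers it owns, skipping any buffer marked aliased,
// because those were donated by the caller and still hold valid data.
class ToyExecutionOutput {
 public:
  void AddBuffer(Address a) { buffers_.push_back(a); }
  void MarkAliased(Address a) { aliased_.insert(a); }
  void Commit() { committed_ = true; }  // success: caller takes ownership
  ~ToyExecutionOutput() {
    if (committed_) return;
    for (Address a : buffers_) {
      if (aliased_.count(a)) continue;  // donated by the caller, never freed
      std::printf("freeing %p\n", a);   // stand-in for real deallocation
    }
  }

 private:
  std::vector<Address> buffers_;
  std::set<Address> aliased_;
  bool committed_ = false;
};

int main() {
  int caller_owned = 0;
  int fresh = 0;
  {
    ToyExecutionOutput out;  // destroyed without Commit(), as on an error path
    out.AddBuffer(&caller_owned);
    out.AddBuffer(&fresh);
    out.MarkAliased(&caller_owned);
  }  // prints one "freeing" line, for &fresh only
}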
tensorflow/compiler/xla/tests/buffer_donation_test.cc

@@ -216,8 +216,8 @@ TEST_F(BufferDonationTest, SimpleWhileTupleTest) {
       HloInstruction::CreateGetTupleElement(f32v1_, while0, 1));
   builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
   module->AddEntryComputation(builder.Build());
-  // Input output aliasing is only supported on TPU.
-#if defined(XLA_TEST_BACKEND_TPU)
+  // Input output aliasing is supported on CPU and GPU.
+#if defined(XLA_TEST_BACKEND_TPU) || defined(XLA_TEST_BACKEND_GPU)
   TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({0}, 0, {0}));
   TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias({1}, 0, {1}));
 #endif
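A note on the test API: in SetUpAlias({0}, 0, {0}) the argument order is output shape index, parameter number, parameter shape index (matching the HloInputOutputAliasConfig::SetUpAlias declaration), so the two calls above alias each element of the output tuple with the corresponding element of parameter 0. An annotated restatement of the first call, with the comment labels being editorial rather than from the source:

TF_ASSERT_OK(module->input_output_alias_config().SetUpAlias(
    /*output_index=*/{0}, /*param_number=*/0, /*param_index=*/{0}));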